Diffstat (limited to 'debian/patches/gcc-linaro.diff')
-rw-r--r--  debian/patches/gcc-linaro.diff  162120
1 file changed, 3 insertions(+), 162117 deletions(-)
diff --git a/debian/patches/gcc-linaro.diff b/debian/patches/gcc-linaro.diff
index c2e8235..ae1c251 100644
--- a/debian/patches/gcc-linaro.diff
+++ b/debian/patches/gcc-linaro.diff
@@ -1,162123 +1,9 @@
-# DP: Changes for the Linaro 6-2017.03 release.
+# DP: Changes for the Linaro 8-2018.xx snapshot.
-MSG=$(git log origin/linaro/gcc-6-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-6-branch --format=format:"%H" -n 1 --grep "gcc-6-branch@${SVN%.}"
+MSG=$(git log origin/linaro/gcc-8-branch --format=format:"%s" -n 1 --grep "Merge branches"); SVN=${MSG##* }; git log origin/gcc-7-branch --format=format:"%H" -n 1 --grep "gcc-7-branch@${SVN%.}"
-LANG=C git diff --no-renames 4b7882c54dabbb54686cb577f2a2cf28e93e743b..630c5507bb37d2caaef60a6f0773e4c820d76fe0 \
+LANG=C git diff --no-renames bb85d61e6bfbadee4494e034a5d8187cf0626aed 1604249e382610b087a72d0d07103f815458cec0 \
| egrep -v '^(diff|index) ' \
| filterdiff --strip=1 --addoldprefix=a/src/ --addnewprefix=b/src/ \
| sed 's,a/src//dev/null,/dev/null,'
---- a/src/contrib/compare_tests
-+++ b/src/contrib/compare_tests
-@@ -107,8 +107,8 @@ elif [ -d "$1" -o -d "$2" ] ; then
- usage "Must specify either two directories or two files"
- fi
-
--sed 's/^XFAIL/FAIL/; s/^XPASS/PASS/' < "$1" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp1
--sed 's/^XFAIL/FAIL/; s/^XPASS/PASS/' < "$2" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp2
-+sed 's/^XFAIL/FAIL/; s/^ERROR/FAIL/; s/^XPASS/PASS/' < "$1" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp1
-+sed 's/^XFAIL/FAIL/; s/^ERROR/FAIL/; s/^XPASS/PASS/' < "$2" | awk '/^Running target / {target = $3} { if (target != "unix") { sub(/: /, "&"target": " ); }; print $0; }' | cut -c1-2000 >$tmp2
-
- before=$tmp1
- now=$tmp2
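
The compare_tests hunk above widens the normalization so that DejaGnu ERROR lines are folded into plain FAILs before the two result files are diffed. A minimal Python sketch of that normalization (illustrative only, not part of the patch):

    import re

    def normalize(line):
        # Fold XFAIL and (with this patch) ERROR into FAIL, and XPASS
        # into PASS, so the later comparison only flags real outcome
        # changes between the two test-run logs.
        for pat, repl in (('^XFAIL', 'FAIL'), ('^ERROR', 'FAIL'),
                          ('^XPASS', 'PASS')):
            line = re.sub(pat, repl, line)
        return line

    print(normalize('ERROR: tcl error sourcing foo.exp'))
    # -> FAIL: tcl error sourcing foo.exp
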
---- a/src/contrib/dg-extract-results.py
-+++ b/src/contrib/dg-extract-results.py
-@@ -134,6 +134,7 @@ class Prog:
- self.end_line = None
- # Known summary types.
- self.count_names = [
-+ '# of DejaGnu errors\t\t',
- '# of expected passes\t\t',
- '# of unexpected failures\t',
- '# of unexpected successes\t',
-@@ -245,6 +246,10 @@ class Prog:
- segment = Segment (filename, file.tell())
- variation.header = segment
-
-+ # Parse the rest of the summary (the '# of ' lines).
-+ if len (variation.counts) == 0:
-+ variation.counts = self.zero_counts()
-+
- # Parse up until the first line of the summary.
- if num_variations == 1:
- end = '\t\t=== ' + tool.name + ' Summary ===\n'
-@@ -291,6 +296,11 @@ class Prog:
- harness.results.append ((key, line))
- if not first_key and sort_logs:
- first_key = key
-+ if line.startswith ('ERROR: (DejaGnu)'):
-+ for i in range (len (self.count_names)):
-+ if 'DejaGnu errors' in self.count_names[i]:
-+ variation.counts[i] += 1
-+ break
-
- # 'Using ...' lines are only interesting in a header. Splitting
- # the test up into parallel runs leads to more 'Using ...' lines
-@@ -309,9 +319,6 @@ class Prog:
- segment.lines -= final_using
- harness.add_segment (first_key, segment)
-
-- # Parse the rest of the summary (the '# of ' lines).
-- if len (variation.counts) == 0:
-- variation.counts = self.zero_counts()
- while True:
- before = file.tell()
- line = file.readline()
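
In dg-extract-results.py the same idea appears in three parts: a new '# of DejaGnu errors' slot in count_names, the zero-initialization of the counts moved ahead of the per-line loop, and an increment whenever a line starts with 'ERROR: (DejaGnu)'. A compressed sketch of that counting logic, assuming a plain list of log lines (the real script walks file segments):

    COUNT_NAMES = [
        '# of DejaGnu errors\t\t',
        '# of expected passes\t\t',
        '# of unexpected failures\t',
    ]

    def count_lines(lines):
        counts = [0] * len(COUNT_NAMES)      # analogue of zero_counts()
        for line in lines:
            if line.startswith('ERROR: (DejaGnu)'):
                for i, name in enumerate(COUNT_NAMES):
                    if 'DejaGnu errors' in name:
                        counts[i] += 1
                        break
        return counts

    print(count_lines(['ERROR: (DejaGnu) proc "foo" does not exist']))
    # -> [1, 0, 0]
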
---- a/src/contrib/dg-extract-results.sh
-+++ b/src/contrib/dg-extract-results.sh
-@@ -369,10 +369,11 @@ EOF
- BEGIN {
- variant="$VAR"
- tool="$TOOL"
-- passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kpasscnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0;
-+ passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kpasscnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0; dgerrorcnt=0;
- curvar=""; insummary=0
- }
- /^Running target / { curvar = \$3; next }
-+/^ERROR: \(DejaGnu\)/ { if (variant == curvar) dgerrorcnt += 1 }
- /^# of / { if (variant == curvar) insummary = 1 }
- /^# of expected passes/ { if (insummary == 1) passcnt += \$5; next; }
- /^# of unexpected successes/ { if (insummary == 1) xpasscnt += \$5; next; }
-@@ -390,6 +391,7 @@ BEGIN {
- { next }
- END {
- printf ("\t\t=== %s Summary for %s ===\n\n", tool, variant)
-+ if (dgerrorcnt != 0) printf ("# of DejaGnu errors\t\t%d\n", dgerrorcnt)
- if (passcnt != 0) printf ("# of expected passes\t\t%d\n", passcnt)
- if (failcnt != 0) printf ("# of unexpected failures\t%d\n", failcnt)
- if (xpasscnt != 0) printf ("# of unexpected successes\t%d\n", xpasscnt)
-@@ -419,8 +421,9 @@ TOTAL_AWK=${TMP}/total.awk
- cat << EOF > $TOTAL_AWK
- BEGIN {
- tool="$TOOL"
-- passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0
-+ passcnt=0; failcnt=0; untstcnt=0; xpasscnt=0; xfailcnt=0; kfailcnt=0; unsupcnt=0; unrescnt=0; dgerrorcnt=0
- }
-+/^# of DejaGnu errors/ { dgerrorcnt += \$5 }
- /^# of expected passes/ { passcnt += \$5 }
- /^# of unexpected failures/ { failcnt += \$5 }
- /^# of unexpected successes/ { xpasscnt += \$5 }
-@@ -431,7 +434,8 @@ BEGIN {
- /^# of unresolved testcases/ { unrescnt += \$5 }
- /^# of unsupported tests/ { unsupcnt += \$5 }
- END {
-- printf ("\n\t\t=== %s Summary ===\n\n", tool)
-+ printf ("\n\t\t=== %s MySummary ===\n\n", tool)
-+ if (dgerrorcnt != 0) printf ("# of DejaGnu errors\t\t%d\n", dgerrorcnt)
- if (passcnt != 0) printf ("# of expected passes\t\t%d\n", passcnt)
- if (failcnt != 0) printf ("# of unexpected failures\t%d\n", failcnt)
- if (xpasscnt != 0) printf ("# of unexpected successes\t%d\n", xpasscnt)
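
The shell counterpart threads a dgerrorcnt through both awk scripts: the per-variant pass counts raw 'ERROR: (DejaGnu)' lines, the totaling pass re-reads the resulting '# of DejaGnu errors' summary lines, and both print the counter only when it is nonzero. Roughly, in Python (names illustrative):

    def print_summary(tool, variant, counts):
        print('\t\t=== %s Summary for %s ===\n' % (tool, variant))
        # Mirror the awk END block: emit each '# of ...' line only
        # when its counter is nonzero.
        for label, n in counts:
            if n != 0:
                print('# of %s\t\t%d' % (label, n))

    print_summary('gcc', 'unix',
                  [('DejaGnu errors', 2), ('expected passes', 1000)])
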
---- /dev/null
-+++ b/src/gcc/LINARO-VERSION
-@@ -0,0 +1 @@
-+Snapshot 6.3-2017.03
---- a/src/gcc/Makefile.in
-+++ b/src/gcc/Makefile.in
-@@ -832,10 +832,12 @@ BASEVER := $(srcdir)/BASE-VER # 4.x.y
- DEVPHASE := $(srcdir)/DEV-PHASE # experimental, prerelease, ""
- DATESTAMP := $(srcdir)/DATESTAMP # YYYYMMDD or empty
- REVISION := $(srcdir)/REVISION # [BRANCH revision XXXXXX]
-+LINAROVER := $(srcdir)/LINARO-VERSION # M.x-YYYY.MM[-S][~dev]
-
- BASEVER_c := $(shell cat $(BASEVER))
- DEVPHASE_c := $(shell cat $(DEVPHASE))
- DATESTAMP_c := $(shell cat $(DATESTAMP))
-+LINAROVER_c := $(shell cat $(LINAROVER))
-
- ifeq (,$(wildcard $(REVISION)))
- REVISION_c :=
-@@ -862,6 +864,7 @@ DATESTAMP_s := \
- "\"$(if $(DEVPHASE_c)$(filter-out 0,$(PATCHLEVEL_c)), $(DATESTAMP_c))\""
- PKGVERSION_s:= "\"@PKGVERSION@\""
- BUGURL_s := "\"@REPORT_BUGS_TO@\""
-+LINAROVER_s := "\"$(LINAROVER_c)\""
-
- PKGVERSION := @PKGVERSION@
- BUGURL_TEXI := @REPORT_BUGS_TEXI@
-@@ -2701,8 +2704,9 @@ PREPROCESSOR_DEFINES = \
- -DSTANDARD_EXEC_PREFIX=\"$(libdir)/gcc/\" \
- @TARGET_SYSTEM_ROOT_DEFINE@
-
--CFLAGS-cppbuiltin.o += $(PREPROCESSOR_DEFINES) -DBASEVER=$(BASEVER_s)
--cppbuiltin.o: $(BASEVER)
-+CFLAGS-cppbuiltin.o += $(PREPROCESSOR_DEFINES) -DBASEVER=$(BASEVER_s) \
-+ -DLINAROVER=$(LINAROVER_s)
-+cppbuiltin.o: $(BASEVER) $(LINAROVER)
-
- CFLAGS-cppdefault.o += $(PREPROCESSOR_DEFINES)
-
---- a/src/gcc/ada/gcc-interface/misc.c
-+++ b/src/gcc/ada/gcc-interface/misc.c
-@@ -255,8 +255,7 @@ static bool
- gnat_post_options (const char **pfilename ATTRIBUTE_UNUSED)
- {
- /* Excess precision other than "fast" requires front-end support. */
-- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
-- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
-+ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
- sorry ("-fexcess-precision=standard for Ada");
- flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
-
---- a/src/gcc/builtins.c
-+++ b/src/gcc/builtins.c
-@@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "predict.h"
- #include "tm_p.h"
---- a/src/gcc/c-family/c-common.c
-+++ b/src/gcc/c-family/c-common.c
-@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "function.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "c-common.h"
- #include "gimple-expr.h"
- #include "tm_p.h"
---- a/src/gcc/c-family/c-opts.c
-+++ b/src/gcc/c-family/c-opts.c
-@@ -772,8 +772,7 @@ c_common_post_options (const char **pfilename)
- support. */
- if (c_dialect_cxx ())
- {
-- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
-- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
-+ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
- sorry ("-fexcess-precision=standard for C++");
- flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
- }
---- a/src/gcc/calls.c
-+++ b/src/gcc/calls.c
-@@ -194,10 +194,19 @@ prepare_call_address (tree fndecl_or_type, rtx funexp, rtx static_chain_value,
- && targetm.small_register_classes_for_mode_p (FUNCTION_MODE))
- ? force_not_mem (memory_address (FUNCTION_MODE, funexp))
- : memory_address (FUNCTION_MODE, funexp));
-- else if (! sibcallp)
-+ else
- {
-- if (!NO_FUNCTION_CSE && optimize && ! flag_no_function_cse)
-- funexp = force_reg (Pmode, funexp);
-+ /* funexp could be a SYMBOL_REF represents a function pointer which is
-+ of ptr_mode. In this case, it should be converted into address mode
-+ to be a valid address for memory rtx pattern. See PR 64971. */
-+ if (GET_MODE (funexp) != Pmode)
-+ funexp = convert_memory_address (Pmode, funexp);
-+
-+ if (! sibcallp)
-+ {
-+ if (!NO_FUNCTION_CSE && optimize && ! flag_no_function_cse)
-+ funexp = force_reg (Pmode, funexp);
-+ }
- }
-
- if (static_chain_value != 0
---- a/src/gcc/cfg.c
-+++ b/src/gcc/cfg.c
-@@ -1064,7 +1064,7 @@ free_original_copy_tables (void)
- delete bb_copy;
- bb_copy = NULL;
- delete bb_original;
-- bb_copy = NULL;
-+ bb_original = NULL;
- delete loop_copy;
- loop_copy = NULL;
- delete original_copy_bb_pool;
---- a/src/gcc/common/config/arm/arm-common.c
-+++ b/src/gcc/common/config/arm/arm-common.c
-@@ -97,6 +97,49 @@ arm_rewrite_mcpu (int argc, const char **argv)
- return arm_rewrite_selected_cpu (argv[argc - 1]);
- }
-
-+struct arm_arch_core_flag
-+{
-+ const char *const name;
-+ const arm_feature_set flags;
-+};
-+
-+static const struct arm_arch_core_flag arm_arch_core_flags[] =
-+{
-+#undef ARM_CORE
-+#define ARM_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
-+ {NAME, FLAGS},
-+#include "config/arm/arm-cores.def"
-+#undef ARM_CORE
-+#undef ARM_ARCH
-+#define ARM_ARCH(NAME, CORE, ARCH, FLAGS) \
-+ {NAME, FLAGS},
-+#include "config/arm/arm-arches.def"
-+#undef ARM_ARCH
-+};
-+
-+/* Called by the driver to check whether the target denoted by current
-+ command line options is a Thumb-only target. ARGV is an array of
-+ -march and -mcpu values (ie. it contains the rhs after the equal
-+ sign) and we use the last one of them to make a decision. The
-+ number of elements in ARGV is given in ARGC. */
-+const char *
-+arm_target_thumb_only (int argc, const char **argv)
-+{
-+ unsigned int opt;
-+
-+ if (argc)
-+ {
-+ for (opt = 0; opt < (ARRAY_SIZE (arm_arch_core_flags)); opt++)
-+ if ((strcmp (argv[argc - 1], arm_arch_core_flags[opt].name) == 0)
-+ && !ARM_FSET_HAS_CPU1(arm_arch_core_flags[opt].flags, FL_NOTM))
-+ return "-mthumb";
-+
-+ return NULL;
-+ }
-+ else
-+ return NULL;
-+}
-+
- #undef ARM_CPU_NAME_LENGTH
-
-
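
arm_target_thumb_only gives the driver a table, generated from arm-cores.def and arm-arches.def, that it probes with the last -mcpu=/-march= value on the command line; if that target lacks the ARM (32-bit) execution state (no FL_NOTM), the driver injects -mthumb. The same last-option-wins lookup, sketched in Python with made-up table entries:

    ARCH_CORE_HAS_ARM_STATE = {
        # name -> supports the ARM (non-Thumb) state?  Illustrative
        # entries only; GCC builds this table from the .def files.
        'cortex-a9': True,
        'cortex-m4': False,
        'armv7-m':   False,
    }

    def target_thumb_only(argv):
        if not argv:
            return None
        last = argv[-1]                 # only the last value matters
        if ARCH_CORE_HAS_ARM_STATE.get(last) is False:
            return '-mthumb'
        return None

    print(target_thumb_only(['cortex-a9', 'cortex-m4']))  # -> -mthumb
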
---- a/src/gcc/config.gcc
-+++ b/src/gcc/config.gcc
-@@ -307,7 +307,7 @@ m32c*-*-*)
- ;;
- aarch64*-*-*)
- cpu_type=aarch64
-- extra_headers="arm_neon.h arm_acle.h"
-+ extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
- c_target_objs="aarch64-c.o"
- cxx_target_objs="aarch64-c.o"
- extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
-@@ -327,7 +327,7 @@ arc*-*-*)
- arm*-*-*)
- cpu_type=arm
- extra_objs="arm-builtins.o aarch-common.o"
-- extra_headers="mmintrin.h arm_neon.h arm_acle.h"
-+ extra_headers="mmintrin.h arm_neon.h arm_acle.h arm_fp16.h arm_cmse.h"
- target_type_format_char='%'
- c_target_objs="arm-c.o"
- cxx_target_objs="arm-c.o"
-@@ -1500,7 +1500,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
- extra_options="${extra_options} linux-android.opt"
- # Assume modern glibc if not targeting Android nor uclibc.
- case ${target} in
-- *-*-*android*|*-*-*uclibc*)
-+ *-*-*android*|*-*-*uclibc*|*-*-*musl*)
- ;;
- *)
- default_gnu_indirect_function=yes
-@@ -1569,7 +1569,7 @@ x86_64-*-linux* | x86_64-*-kfreebsd*-gnu | x86_64-*-knetbsd*-gnu)
- extra_options="${extra_options} linux-android.opt"
- # Assume modern glibc if not targeting Android nor uclibc.
- case ${target} in
-- *-*-*android*|*-*-*uclibc*)
-+ *-*-*android*|*-*-*uclibc*|*-*-*musl*)
- ;;
- *)
- default_gnu_indirect_function=yes
-@@ -3811,38 +3811,51 @@ case "${target}" in
- # Add extra multilibs
- if test "x$with_multilib_list" != x; then
- arm_multilibs=`echo $with_multilib_list | sed -e 's/,/ /g'`
-- for arm_multilib in ${arm_multilibs}; do
-- case ${arm_multilib} in
-- aprofile)
-+ case ${arm_multilibs} in
-+ aprofile)
- # Note that arm/t-aprofile is a
- # stand-alone make file fragment to be
- # used only with itself. We do not
- # specifically use the
- # TM_MULTILIB_OPTION framework because
- # this shorthand is more
-- # pragmatic. Additionally it is only
-- # designed to work without any
-- # with-cpu, with-arch with-mode
-- # with-fpu or with-float options.
-- if test "x$with_arch" != x \
-- || test "x$with_cpu" != x \
-- || test "x$with_float" != x \
-- || test "x$with_fpu" != x \
-- || test "x$with_mode" != x ; then
-- echo "Error: You cannot use any of --with-arch/cpu/fpu/float/mode with --with-multilib-list=aprofile" 1>&2
-- exit 1
-- fi
-- tmake_file="${tmake_file} arm/t-aprofile"
-- break
-- ;;
-- default)
-- ;;
-- *)
-- echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
-- exit 1
-- ;;
-- esac
-- done
-+ # pragmatic.
-+ tmake_profile_file="arm/t-aprofile"
-+ ;;
-+ rmprofile)
-+ # Note that arm/t-rmprofile is a
-+ # stand-alone make file fragment to be
-+ # used only with itself. We do not
-+ # specifically use the
-+ # TM_MULTILIB_OPTION framework because
-+ # this shorthand is more
-+ # pragmatic.
-+ tmake_profile_file="arm/t-rmprofile"
-+ ;;
-+ default)
-+ ;;
-+ *)
-+ echo "Error: --with-multilib-list=${with_multilib_list} not supported." 1>&2
-+ exit 1
-+ ;;
-+ esac
-+
-+ if test "x${tmake_profile_file}" != x ; then
-+ # arm/t-aprofile and arm/t-rmprofile are only
-+ # designed to work without any with-cpu,
-+ # with-arch, with-mode, with-fpu or with-float
-+ # options.
-+ if test "x$with_arch" != x \
-+ || test "x$with_cpu" != x \
-+ || test "x$with_float" != x \
-+ || test "x$with_fpu" != x \
-+ || test "x$with_mode" != x ; then
-+ echo "Error: You cannot use any of --with-arch/cpu/fpu/float/mode with --with-multilib-list=${with_multilib_list}" 1>&2
-+ exit 1
-+ fi
-+
-+ tmake_file="${tmake_file} ${tmake_profile_file}"
-+ fi
- fi
- ;;
-
---- a/src/gcc/config/aarch64/aarch64-arches.def
-+++ b/src/gcc/config/aarch64/aarch64-arches.def
-@@ -32,4 +32,6 @@
-
- AARCH64_ARCH("armv8-a", generic, 8A, 8, AARCH64_FL_FOR_ARCH8)
- AARCH64_ARCH("armv8.1-a", generic, 8_1A, 8, AARCH64_FL_FOR_ARCH8_1)
-+AARCH64_ARCH("armv8.2-a", generic, 8_2A, 8, AARCH64_FL_FOR_ARCH8_2)
-+AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3)
-
---- a/src/gcc/config/aarch64/aarch64-builtins.c
-+++ b/src/gcc/config/aarch64/aarch64-builtins.c
-@@ -62,6 +62,7 @@
- #define si_UP SImode
- #define sf_UP SFmode
- #define hi_UP HImode
-+#define hf_UP HFmode
- #define qi_UP QImode
- #define UP(X) X##_UP
-
-@@ -139,6 +140,10 @@ aarch64_types_binop_ssu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_none, qualifier_none, qualifier_unsigned };
- #define TYPES_BINOP_SSU (aarch64_types_binop_ssu_qualifiers)
- static enum aarch64_type_qualifiers
-+aarch64_types_binop_uss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_unsigned, qualifier_none, qualifier_none };
-+#define TYPES_BINOP_USS (aarch64_types_binop_uss_qualifiers)
-+static enum aarch64_type_qualifiers
- aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_poly, qualifier_poly, qualifier_poly };
- #define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
-@@ -164,6 +169,10 @@ aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- #define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers)
-
- static enum aarch64_type_qualifiers
-+aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_poly, qualifier_none, qualifier_immediate };
-+#define TYPES_GETREGP (aarch64_types_binop_imm_p_qualifiers)
-+static enum aarch64_type_qualifiers
- aarch64_types_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_none, qualifier_none, qualifier_immediate };
- #define TYPES_GETREG (aarch64_types_binop_imm_qualifiers)
-@@ -173,16 +182,29 @@ aarch64_types_shift_to_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_unsigned, qualifier_none, qualifier_immediate };
- #define TYPES_SHIFTIMM_USS (aarch64_types_shift_to_unsigned_qualifiers)
- static enum aarch64_type_qualifiers
-+aarch64_types_fcvt_from_unsigned_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_none, qualifier_unsigned, qualifier_immediate };
-+#define TYPES_FCVTIMM_SUS (aarch64_types_fcvt_from_unsigned_qualifiers)
-+static enum aarch64_type_qualifiers
- aarch64_types_unsigned_shift_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate };
- #define TYPES_USHIFTIMM (aarch64_types_unsigned_shift_qualifiers)
-
- static enum aarch64_type_qualifiers
--aarch64_types_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-- = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate };
--#define TYPES_SETREG (aarch64_types_ternop_imm_qualifiers)
--#define TYPES_SHIFTINSERT (aarch64_types_ternop_imm_qualifiers)
--#define TYPES_SHIFTACC (aarch64_types_ternop_imm_qualifiers)
-+aarch64_types_ternop_s_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_none, qualifier_none, qualifier_poly, qualifier_immediate};
-+#define TYPES_SETREGP (aarch64_types_ternop_s_imm_p_qualifiers)
-+static enum aarch64_type_qualifiers
-+aarch64_types_ternop_s_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate};
-+#define TYPES_SETREG (aarch64_types_ternop_s_imm_qualifiers)
-+#define TYPES_SHIFTINSERT (aarch64_types_ternop_s_imm_qualifiers)
-+#define TYPES_SHIFTACC (aarch64_types_ternop_s_imm_qualifiers)
-+
-+static enum aarch64_type_qualifiers
-+aarch64_types_ternop_p_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_immediate};
-+#define TYPES_SHIFTINSERTP (aarch64_types_ternop_p_imm_qualifiers)
-
- static enum aarch64_type_qualifiers
- aarch64_types_unsigned_shiftacc_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-@@ -197,6 +219,11 @@ aarch64_types_combine_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- #define TYPES_COMBINE (aarch64_types_combine_qualifiers)
-
- static enum aarch64_type_qualifiers
-+aarch64_types_combine_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_poly, qualifier_poly, qualifier_poly };
-+#define TYPES_COMBINEP (aarch64_types_combine_p_qualifiers)
-+
-+static enum aarch64_type_qualifiers
- aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_none, qualifier_const_pointer_map_mode };
- #define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
-@@ -229,6 +256,10 @@ aarch64_types_bsl_u_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- qualifier_map_mode | qualifier_pointer to build a pointer to the
- element type of the vector. */
- static enum aarch64_type_qualifiers
-+aarch64_types_store1_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-+ = { qualifier_void, qualifier_pointer_map_mode, qualifier_poly };
-+#define TYPES_STORE1P (aarch64_types_store1_p_qualifiers)
-+static enum aarch64_type_qualifiers
- aarch64_types_store1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- = { qualifier_void, qualifier_pointer_map_mode, qualifier_none };
- #define TYPES_STORE1 (aarch64_types_store1_qualifiers)
-@@ -753,16 +784,16 @@ aarch64_init_simd_builtins (void)
-
- if (qualifiers & qualifier_unsigned)
- {
-- type_signature[arg_num] = 'u';
-+ type_signature[op_num] = 'u';
- print_type_signature_p = true;
- }
- else if (qualifiers & qualifier_poly)
- {
-- type_signature[arg_num] = 'p';
-+ type_signature[op_num] = 'p';
- print_type_signature_p = true;
- }
- else
-- type_signature[arg_num] = 's';
-+ type_signature[op_num] = 's';
-
- /* Skip an internal operand for vget_{low, high}. */
- if (qualifiers & qualifier_internal)
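
The small aarch64-builtins.c fix above changes type_signature[arg_num] to type_signature[op_num]: the signature string is indexed by operand, and the operand and argument counts can diverge (operand 0 is the result), so storing each 'u'/'p'/'s' marker at the argument index could shift markers into the wrong slot of suffixes like _uss. A toy model of the corrected indexing (the real loop inspects RTL operand qualifiers):

    def type_signature(op_qualifiers):
        # op 0 is the return value; the marker for each operand must be
        # stored at op_num -- indexing by the argument number instead
        # shifts markers whenever the two counts diverge.
        sig = ['s'] * len(op_qualifiers)
        for op_num, quals in enumerate(op_qualifiers):
            if 'unsigned' in quals:
                sig[op_num] = 'u'
            elif 'poly' in quals:
                sig[op_num] = 'p'
        return ''.join(sig)

    print(type_signature([{'unsigned'}, set(), set()]))  # -> 'uss'
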
---- a/src/gcc/config/aarch64/aarch64-c.c
-+++ b/src/gcc/config/aarch64/aarch64-c.c
-@@ -95,6 +95,11 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
- else
- cpp_undef (pfile, "__ARM_FP");
-
-+ aarch64_def_or_undef (TARGET_FP_F16INST,
-+ "__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", pfile);
-+ aarch64_def_or_undef (TARGET_SIMD_F16INST,
-+ "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", pfile);
-+
- aarch64_def_or_undef (TARGET_SIMD, "__ARM_FEATURE_NUMERIC_MAXMIN", pfile);
- aarch64_def_or_undef (TARGET_SIMD, "__ARM_NEON", pfile);
-
---- a/src/gcc/config/aarch64/aarch64-cores.def
-+++ b/src/gcc/config/aarch64/aarch64-cores.def
-@@ -40,17 +40,33 @@
-
- /* V8 Architecture Processors. */
-
-+/* ARM ('A') cores. */
- AARCH64_CORE("cortex-a35", cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, "0x41", "0xd04")
- AARCH64_CORE("cortex-a53", cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
- AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
- AARCH64_CORE("cortex-a72", cortexa72, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-+AARCH64_CORE("cortex-a73", cortexa73, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09")
-+
-+/* Samsung ('S') cores. */
- AARCH64_CORE("exynos-m1", exynosm1, exynosm1, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001")
--AARCH64_CORE("qdf24xx", qdf24xx, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57, "0x51", "0x800")
-+
-+/* Qualcomm ('Q') cores. */
-+AARCH64_CORE("qdf24xx", qdf24xx, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx, "0x51", "0x800")
-+
-+/* Cavium ('C') cores. */
- AARCH64_CORE("thunderx", thunderx, thunderx, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, "0x43", "0x0a1")
-+
-+/* APM ('P') cores. */
- AARCH64_CORE("xgene1", xgene1, xgene1, 8A, AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
-
-+/* V8.1 Architecture Processors. */
-+
-+/* Broadcom ('B') cores. */
-+AARCH64_CORE("vulcan", vulcan, cortexa57, 8_1A, AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, vulcan, "0x42", "0x516")
-+
- /* V8 big.LITTLE implementations. */
-
- AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07.0xd03")
- AARCH64_CORE("cortex-a72.cortex-a53", cortexa72cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08.0xd03")
--
-+AARCH64_CORE("cortex-a73.cortex-a35", cortexa73cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09.0xd04")
-+AARCH64_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa73, "0x41", "0xd09.0xd03")
---- a/src/gcc/config/aarch64/aarch64-cost-tables.h
-+++ b/src/gcc/config/aarch64/aarch64-cost-tables.h
-@@ -127,6 +127,108 @@ const struct cpu_cost_table thunderx_extra_costs =
- }
- };
-
-+const struct cpu_cost_table vulcan_extra_costs =
-+{
-+ /* ALU */
-+ {
-+ 0, /* Arith. */
-+ 0, /* Logical. */
-+ 0, /* Shift. */
-+ 0, /* Shift_reg. */
-+ COSTS_N_INSNS (1), /* Arith_shift. */
-+ COSTS_N_INSNS (1), /* Arith_shift_reg. */
-+ COSTS_N_INSNS (1), /* Log_shift. */
-+ COSTS_N_INSNS (1), /* Log_shift_reg. */
-+ 0, /* Extend. */
-+ COSTS_N_INSNS (1), /* Extend_arith. */
-+ 0, /* Bfi. */
-+ 0, /* Bfx. */
-+ COSTS_N_INSNS (3), /* Clz. */
-+ 0, /* Rev. */
-+ 0, /* Non_exec. */
-+ true /* Non_exec_costs_exec. */
-+ },
-+ {
-+ /* MULT SImode */
-+ {
-+ COSTS_N_INSNS (4), /* Simple. */
-+ COSTS_N_INSNS (4), /* Flag_setting. */
-+ COSTS_N_INSNS (4), /* Extend. */
-+ COSTS_N_INSNS (5), /* Add. */
-+ COSTS_N_INSNS (5), /* Extend_add. */
-+ COSTS_N_INSNS (18) /* Idiv. */
-+ },
-+ /* MULT DImode */
-+ {
-+ COSTS_N_INSNS (4), /* Simple. */
-+ 0, /* Flag_setting. */
-+ COSTS_N_INSNS (4), /* Extend. */
-+ COSTS_N_INSNS (5), /* Add. */
-+ COSTS_N_INSNS (5), /* Extend_add. */
-+ COSTS_N_INSNS (26) /* Idiv. */
-+ }
-+ },
-+ /* LD/ST */
-+ {
-+ COSTS_N_INSNS (4), /* Load. */
-+ COSTS_N_INSNS (4), /* Load_sign_extend. */
-+ COSTS_N_INSNS (5), /* Ldrd. */
-+ COSTS_N_INSNS (4), /* Ldm_1st. */
-+ 1, /* Ldm_regs_per_insn_1st. */
-+ 1, /* Ldm_regs_per_insn_subsequent. */
-+ COSTS_N_INSNS (4), /* Loadf. */
-+ COSTS_N_INSNS (4), /* Loadd. */
-+ COSTS_N_INSNS (4), /* Load_unaligned. */
-+ 0, /* Store. */
-+ 0, /* Strd. */
-+ 0, /* Stm_1st. */
-+ 1, /* Stm_regs_per_insn_1st. */
-+ 1, /* Stm_regs_per_insn_subsequent. */
-+ 0, /* Storef. */
-+ 0, /* Stored. */
-+ 0, /* Store_unaligned. */
-+ COSTS_N_INSNS (1), /* Loadv. */
-+ COSTS_N_INSNS (1) /* Storev. */
-+ },
-+ {
-+ /* FP SFmode */
-+ {
-+ COSTS_N_INSNS (4), /* Div. */
-+ COSTS_N_INSNS (1), /* Mult. */
-+ COSTS_N_INSNS (1), /* Mult_addsub. */
-+ COSTS_N_INSNS (1), /* Fma. */
-+ COSTS_N_INSNS (1), /* Addsub. */
-+ COSTS_N_INSNS (1), /* Fpconst. */
-+ COSTS_N_INSNS (1), /* Neg. */
-+ COSTS_N_INSNS (1), /* Compare. */
-+ COSTS_N_INSNS (2), /* Widen. */
-+ COSTS_N_INSNS (2), /* Narrow. */
-+ COSTS_N_INSNS (2), /* Toint. */
-+ COSTS_N_INSNS (2), /* Fromint. */
-+ COSTS_N_INSNS (2) /* Roundint. */
-+ },
-+ /* FP DFmode */
-+ {
-+ COSTS_N_INSNS (6), /* Div. */
-+ COSTS_N_INSNS (1), /* Mult. */
-+ COSTS_N_INSNS (1), /* Mult_addsub. */
-+ COSTS_N_INSNS (1), /* Fma. */
-+ COSTS_N_INSNS (1), /* Addsub. */
-+ COSTS_N_INSNS (1), /* Fpconst. */
-+ COSTS_N_INSNS (1), /* Neg. */
-+ COSTS_N_INSNS (1), /* Compare. */
-+ COSTS_N_INSNS (2), /* Widen. */
-+ COSTS_N_INSNS (2), /* Narrow. */
-+ COSTS_N_INSNS (2), /* Toint. */
-+ COSTS_N_INSNS (2), /* Fromint. */
-+ COSTS_N_INSNS (2) /* Roundint. */
-+ }
-+ },
-+ /* Vector */
-+ {
-+ COSTS_N_INSNS (1) /* Alu. */
-+ }
-+};
-
-
- #endif
---- a/src/gcc/config/aarch64/aarch64-elf.h
-+++ b/src/gcc/config/aarch64/aarch64-elf.h
-@@ -25,15 +25,6 @@
- #define ASM_OUTPUT_LABELREF(FILE, NAME) \
- aarch64_asm_output_labelref (FILE, NAME)
-
--#define ASM_OUTPUT_DEF(FILE, NAME1, NAME2) \
-- do \
-- { \
-- assemble_name (FILE, NAME1); \
-- fputs (" = ", FILE); \
-- assemble_name (FILE, NAME2); \
-- fputc ('\n', FILE); \
-- } while (0)
--
- #define TEXT_SECTION_ASM_OP "\t.text"
- #define DATA_SECTION_ASM_OP "\t.data"
- #define BSS_SECTION_ASM_OP "\t.bss"
---- a/src/gcc/config/aarch64/aarch64-modes.def
-+++ b/src/gcc/config/aarch64/aarch64-modes.def
-@@ -21,8 +21,6 @@
- CC_MODE (CCFP);
- CC_MODE (CCFPE);
- CC_MODE (CC_SWP);
--CC_MODE (CC_ZESWP); /* zero-extend LHS (but swap to make it RHS). */
--CC_MODE (CC_SESWP); /* sign-extend LHS (but swap to make it RHS). */
- CC_MODE (CC_NZ); /* Only N and Z bits of condition flags are valid. */
- CC_MODE (CC_Z); /* Only Z bit of condition flags is valid. */
- CC_MODE (CC_C); /* Only C bit of condition flags is valid. */
---- a/src/gcc/config/aarch64/aarch64-option-extensions.def
-+++ b/src/gcc/config/aarch64/aarch64-option-extensions.def
-@@ -39,8 +39,8 @@
- that are required. Their order is not important. */
-
- /* Enabling "fp" just enables "fp".
-- Disabling "fp" also disables "simd", "crypto". */
--AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO, "fp")
-+ Disabling "fp" also disables "simd", "crypto" and "fp16". */
-+AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO | AARCH64_FL_F16, "fp")
-
- /* Enabling "simd" also enables "fp".
- Disabling "simd" also disables "crypto". */
-@@ -55,3 +55,7 @@ AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, "crc32")
-
- /* Enabling or disabling "lse" only changes "lse". */
- AARCH64_OPT_EXTENSION("lse", AARCH64_FL_LSE, 0, 0, "atomics")
-+
-+/* Enabling "fp16" also enables "fp".
-+ Disabling "fp16" just disables "fp16". */
-+AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, 0, "fp16")
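
The option-extension table encodes asymmetric dependencies: enabling an extension pulls in the flags it requires, while disabling one drops everything that requires it. So +fp16 implies fp, but -fp now also removes fp16 (along with simd and crypto). A small Python model of that propagation (flag names abbreviated, table illustrative):

    EXTENSIONS = {
        # name: (flags set when enabled, extra flags cleared when disabled)
        'fp':   ({'fp'},         {'simd', 'crypto', 'fp16'}),
        'simd': ({'simd', 'fp'}, {'crypto'}),
        'fp16': ({'fp16', 'fp'}, set()),
    }

    def apply(flags, ext, enable):
        on, off = EXTENSIONS[ext]
        return flags | on if enable else flags - off - {ext}

    flags = apply(set(), 'fp16', True)   # {'fp', 'fp16'}
    flags = apply(flags, 'fp', False)    # set(): fp16 leaves with fp
    print(flags)
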
---- /dev/null
-+++ b/src/gcc/config/aarch64/aarch64-passes.def
-@@ -0,0 +1,21 @@
-+/* AArch64-specific passes declarations.
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 3, or (at your option)
-+ any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with GCC; see the file COPYING3. If not see
-+ <http://www.gnu.org/licenses/>. */
-+
-+INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
---- a/src/gcc/config/aarch64/aarch64-protos.h
-+++ b/src/gcc/config/aarch64/aarch64-protos.h
-@@ -178,6 +178,25 @@ struct cpu_branch_cost
- const int unpredictable; /* Unpredictable branch or optimizing for speed. */
- };
-
-+/* Control approximate alternatives to certain FP operators. */
-+#define AARCH64_APPROX_MODE(MODE) \
-+ ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
-+ ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
-+ : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
-+ ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
-+ + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
-+ : (0))
-+#define AARCH64_APPROX_NONE (0)
-+#define AARCH64_APPROX_ALL (-1)
-+
-+/* Allowed modes for approximations. */
-+struct cpu_approx_modes
-+{
-+ const unsigned int division; /* Division. */
-+ const unsigned int sqrt; /* Square root. */
-+ const unsigned int recip_sqrt; /* Reciprocal square root. */
-+};
-+
- struct tune_params
- {
- const struct cpu_cost_table *insn_extra_cost;
-@@ -185,6 +204,7 @@ struct tune_params
- const struct cpu_regmove_cost *regmove_cost;
- const struct cpu_vector_cost *vec_costs;
- const struct cpu_branch_cost *branch_costs;
-+ const struct cpu_approx_modes *approx_modes;
- int memmov_cost;
- int issue_rate;
- unsigned int fusible_ops;
-@@ -282,14 +302,14 @@ int aarch64_get_condition_code (rtx);
- bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
- int aarch64_branch_cost (bool, bool);
- enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
--bool aarch64_cannot_change_mode_class (machine_mode,
-- machine_mode,
-- enum reg_class);
- bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
- bool aarch64_constant_address_p (rtx);
-+bool aarch64_emit_approx_div (rtx, rtx, rtx);
-+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
- bool aarch64_expand_movmem (rtx *);
- bool aarch64_float_const_zero_rtx_p (rtx);
- bool aarch64_function_arg_regno_p (unsigned);
-+bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
- bool aarch64_gen_movmemqi (rtx *);
- bool aarch64_gimple_fold_builtin (gimple_stmt_iterator *);
- bool aarch64_is_extend_from_extract (machine_mode, rtx, rtx);
-@@ -298,6 +318,7 @@ bool aarch64_is_noplt_call_p (rtx);
- bool aarch64_label_mentioned_p (rtx);
- void aarch64_declare_function_name (FILE *, const char*, tree);
- bool aarch64_legitimate_pic_operand_p (rtx);
-+bool aarch64_mask_and_shift_for_ubfiz_p (machine_mode, rtx, rtx);
- bool aarch64_modes_tieable_p (machine_mode mode1,
- machine_mode mode2);
- bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-@@ -320,6 +341,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
- bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
- bool aarch64_simd_valid_immediate (rtx, machine_mode, bool,
- struct simd_immediate_info *);
-+bool aarch64_split_dimode_const_store (rtx, rtx);
- bool aarch64_symbolic_address_p (rtx);
- bool aarch64_uimm12_shift (HOST_WIDE_INT);
- bool aarch64_use_return_insn_p (void);
-@@ -335,11 +357,9 @@ machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned,
- machine_mode);
- int aarch64_hard_regno_mode_ok (unsigned, machine_mode);
- int aarch64_hard_regno_nregs (unsigned, machine_mode);
--int aarch64_simd_attr_length_move (rtx_insn *);
- int aarch64_uxt_size (int, HOST_WIDE_INT);
- int aarch64_vec_fpconst_pow_of_2 (rtx);
- rtx aarch64_eh_return_handler_rtx (void);
--rtx aarch64_legitimize_reload_address (rtx *, machine_mode, int, int, int);
- rtx aarch64_mask_from_zextract_ops (rtx, rtx);
- const char *aarch64_output_move_struct (rtx *operands);
- rtx aarch64_return_addr (int, rtx);
-@@ -352,7 +372,6 @@ unsigned aarch64_dbx_register_number (unsigned);
- unsigned aarch64_trampoline_size (void);
- void aarch64_asm_output_labelref (FILE *, const char *);
- void aarch64_cpu_cpp_builtins (cpp_reader *);
--void aarch64_elf_asm_named_section (const char *, unsigned, tree);
- const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
- const char * aarch64_output_probe_stack_range (rtx, rtx);
- void aarch64_err_no_fpadvsimd (machine_mode, const char *);
-@@ -369,7 +388,6 @@ void aarch64_register_pragmas (void);
- void aarch64_relayout_simd_types (void);
- void aarch64_reset_previous_fndecl (void);
- void aarch64_save_restore_target_globals (tree);
--void aarch64_emit_approx_rsqrt (rtx, rtx);
-
- /* Initialize builtins for SIMD intrinsics. */
- void init_aarch64_simd_builtins (void);
-@@ -436,7 +454,6 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode);
- bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
- bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
- bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
--extern bool aarch64_nopcrelative_literal_loads;
-
- extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
- tree, HOST_WIDE_INT);
-@@ -450,4 +467,6 @@ enum aarch64_parse_opt_result aarch64_parse_extension (const char *,
- std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
- unsigned long);
-
-+rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
-+
- #endif /* GCC_AARCH64_PROTOS_H */
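
AARCH64_APPROX_MODE, added in the aarch64-protos.h section above, packs one bit per floating-point machine mode into an unsigned int, scalar FP modes in the low bits and vector FP modes above them, so each cpu_approx_modes field is a per-mode permission mask with AARCH64_APPROX_NONE (0) and AARCH64_APPROX_ALL (-1) as the endpoints. A Python rendering of the bit layout, with stand-in mode lists (GCC's real mode ordering comes from its modes .def files):

    FLOAT_MODES = ['HF', 'SF', 'DF']            # stand-ins for MIN..MAX_MODE_FLOAT
    VEC_FLOAT_MODES = ['V4HF', 'V2SF', 'V2DF']  # ...and the vector range

    def approx_mode_bit(mode):
        if mode in FLOAT_MODES:
            return 1 << FLOAT_MODES.index(mode)
        if mode in VEC_FLOAT_MODES:
            # Vector modes are offset past the whole scalar range.
            return 1 << (len(FLOAT_MODES) + VEC_FLOAT_MODES.index(mode))
        return 0

    APPROX_NONE, APPROX_ALL = 0, -1
    # e.g. a CPU allowing approximate rsqrt only for SF and V2SF:
    mask = approx_mode_bit('SF') | approx_mode_bit('V2SF')
    print(bin(mask))   # 0b10010
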
---- a/src/gcc/config/aarch64/aarch64-simd-builtins.def
-+++ b/src/gcc/config/aarch64/aarch64-simd-builtins.def
-@@ -40,9 +40,10 @@
- 10 - CODE_FOR_<name><mode>. */
-
- BUILTIN_VDC (COMBINE, combine, 0)
-+ VAR1 (COMBINEP, combine, 0, di)
- BUILTIN_VB (BINOP, pmul, 0)
-- BUILTIN_VALLF (BINOP, fmulx, 0)
-- BUILTIN_VDQF_DF (UNOP, sqrt, 2)
-+ BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0)
-+ BUILTIN_VHSDF_DF (UNOP, sqrt, 2)
- BUILTIN_VD_BHSI (BINOP, addp, 0)
- VAR1 (UNOP, addp, 0, di)
- BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
-@@ -68,14 +69,23 @@
- BUILTIN_VDC (GETREG, get_dregoi, 0)
- BUILTIN_VDC (GETREG, get_dregci, 0)
- BUILTIN_VDC (GETREG, get_dregxi, 0)
-+ VAR1 (GETREGP, get_dregoi, 0, di)
-+ VAR1 (GETREGP, get_dregci, 0, di)
-+ VAR1 (GETREGP, get_dregxi, 0, di)
- /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>. */
- BUILTIN_VQ (GETREG, get_qregoi, 0)
- BUILTIN_VQ (GETREG, get_qregci, 0)
- BUILTIN_VQ (GETREG, get_qregxi, 0)
-+ VAR1 (GETREGP, get_qregoi, 0, v2di)
-+ VAR1 (GETREGP, get_qregci, 0, v2di)
-+ VAR1 (GETREGP, get_qregxi, 0, v2di)
- /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>. */
- BUILTIN_VQ (SETREG, set_qregoi, 0)
- BUILTIN_VQ (SETREG, set_qregci, 0)
- BUILTIN_VQ (SETREG, set_qregxi, 0)
-+ VAR1 (SETREGP, set_qregoi, 0, v2di)
-+ VAR1 (SETREGP, set_qregci, 0, v2di)
-+ VAR1 (SETREGP, set_qregxi, 0, v2di)
- /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>. */
- BUILTIN_VDC (LOADSTRUCT, ld2, 0)
- BUILTIN_VDC (LOADSTRUCT, ld3, 0)
-@@ -224,6 +234,7 @@
- BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
- BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
- BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
-+ VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di)
- BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
- /* Implemented by aarch64_<sur>qshl<u>_n<mode>. */
- BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
-@@ -234,105 +245,145 @@
- BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
-
- /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar). */
-- BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
-- BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
-+ BUILTIN_VDQIF_F16 (UNOP, reduc_smax_scal_, 10)
-+ BUILTIN_VDQIF_F16 (UNOP, reduc_smin_scal_, 10)
- BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
- BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
-- BUILTIN_VDQF (UNOP, reduc_smax_nan_scal_, 10)
-- BUILTIN_VDQF (UNOP, reduc_smin_nan_scal_, 10)
-+ BUILTIN_VHSDF (UNOP, reduc_smax_nan_scal_, 10)
-+ BUILTIN_VHSDF (UNOP, reduc_smin_nan_scal_, 10)
-
-- /* Implemented by <maxmin><mode>3.
-+ /* Implemented by <maxmin_uns><mode>3.
- smax variants map to fmaxnm,
- smax_nan variants map to fmax. */
- BUILTIN_VDQ_BHSI (BINOP, smax, 3)
- BUILTIN_VDQ_BHSI (BINOP, smin, 3)
- BUILTIN_VDQ_BHSI (BINOP, umax, 3)
- BUILTIN_VDQ_BHSI (BINOP, umin, 3)
-- BUILTIN_VDQF (BINOP, smax_nan, 3)
-- BUILTIN_VDQF (BINOP, smin_nan, 3)
-+ BUILTIN_VHSDF_DF (BINOP, smax_nan, 3)
-+ BUILTIN_VHSDF_DF (BINOP, smin_nan, 3)
-
-- /* Implemented by <fmaxmin><mode>3. */
-- BUILTIN_VDQF (BINOP, fmax, 3)
-- BUILTIN_VDQF (BINOP, fmin, 3)
-+ /* Implemented by <maxmin_uns><mode>3. */
-+ BUILTIN_VHSDF_HSDF (BINOP, fmax, 3)
-+ BUILTIN_VHSDF_HSDF (BINOP, fmin, 3)
-
- /* Implemented by aarch64_<maxmin_uns>p<mode>. */
- BUILTIN_VDQ_BHSI (BINOP, smaxp, 0)
- BUILTIN_VDQ_BHSI (BINOP, sminp, 0)
- BUILTIN_VDQ_BHSI (BINOP, umaxp, 0)
- BUILTIN_VDQ_BHSI (BINOP, uminp, 0)
-- BUILTIN_VDQF (BINOP, smaxp, 0)
-- BUILTIN_VDQF (BINOP, sminp, 0)
-- BUILTIN_VDQF (BINOP, smax_nanp, 0)
-- BUILTIN_VDQF (BINOP, smin_nanp, 0)
-+ BUILTIN_VHSDF (BINOP, smaxp, 0)
-+ BUILTIN_VHSDF (BINOP, sminp, 0)
-+ BUILTIN_VHSDF (BINOP, smax_nanp, 0)
-+ BUILTIN_VHSDF (BINOP, smin_nanp, 0)
-
- /* Implemented by <frint_pattern><mode>2. */
-- BUILTIN_VDQF (UNOP, btrunc, 2)
-- BUILTIN_VDQF (UNOP, ceil, 2)
-- BUILTIN_VDQF (UNOP, floor, 2)
-- BUILTIN_VDQF (UNOP, nearbyint, 2)
-- BUILTIN_VDQF (UNOP, rint, 2)
-- BUILTIN_VDQF (UNOP, round, 2)
-- BUILTIN_VDQF_DF (UNOP, frintn, 2)
-+ BUILTIN_VHSDF (UNOP, btrunc, 2)
-+ BUILTIN_VHSDF (UNOP, ceil, 2)
-+ BUILTIN_VHSDF (UNOP, floor, 2)
-+ BUILTIN_VHSDF (UNOP, nearbyint, 2)
-+ BUILTIN_VHSDF (UNOP, rint, 2)
-+ BUILTIN_VHSDF (UNOP, round, 2)
-+ BUILTIN_VHSDF_DF (UNOP, frintn, 2)
-+
-+ VAR1 (UNOP, btrunc, 2, hf)
-+ VAR1 (UNOP, ceil, 2, hf)
-+ VAR1 (UNOP, floor, 2, hf)
-+ VAR1 (UNOP, frintn, 2, hf)
-+ VAR1 (UNOP, nearbyint, 2, hf)
-+ VAR1 (UNOP, rint, 2, hf)
-+ VAR1 (UNOP, round, 2, hf)
-
- /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2. */
-+ VAR1 (UNOP, lbtruncv4hf, 2, v4hi)
-+ VAR1 (UNOP, lbtruncv8hf, 2, v8hi)
- VAR1 (UNOP, lbtruncv2sf, 2, v2si)
- VAR1 (UNOP, lbtruncv4sf, 2, v4si)
- VAR1 (UNOP, lbtruncv2df, 2, v2di)
-
-+ VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi)
-+ VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi)
- VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
- VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
- VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)
-
-+ VAR1 (UNOP, lroundv4hf, 2, v4hi)
-+ VAR1 (UNOP, lroundv8hf, 2, v8hi)
- VAR1 (UNOP, lroundv2sf, 2, v2si)
- VAR1 (UNOP, lroundv4sf, 2, v4si)
- VAR1 (UNOP, lroundv2df, 2, v2di)
-- /* Implemented by l<fcvt_pattern><su_optab><GPF:mode><GPI:mode>2. */
-+ /* Implemented by l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2. */
-+ BUILTIN_GPI_I16 (UNOP, lroundhf, 2)
- VAR1 (UNOP, lroundsf, 2, si)
- VAR1 (UNOP, lrounddf, 2, di)
-
-+ VAR1 (UNOPUS, lrounduv4hf, 2, v4hi)
-+ VAR1 (UNOPUS, lrounduv8hf, 2, v8hi)
- VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
- VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
- VAR1 (UNOPUS, lrounduv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOPUS, lrounduhf, 2)
- VAR1 (UNOPUS, lroundusf, 2, si)
- VAR1 (UNOPUS, lroundudf, 2, di)
-
-+ VAR1 (UNOP, lceilv4hf, 2, v4hi)
-+ VAR1 (UNOP, lceilv8hf, 2, v8hi)
- VAR1 (UNOP, lceilv2sf, 2, v2si)
- VAR1 (UNOP, lceilv4sf, 2, v4si)
- VAR1 (UNOP, lceilv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOP, lceilhf, 2)
-
-+ VAR1 (UNOPUS, lceiluv4hf, 2, v4hi)
-+ VAR1 (UNOPUS, lceiluv8hf, 2, v8hi)
- VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
- VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
- VAR1 (UNOPUS, lceiluv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOPUS, lceiluhf, 2)
- VAR1 (UNOPUS, lceilusf, 2, si)
- VAR1 (UNOPUS, lceiludf, 2, di)
-
-+ VAR1 (UNOP, lfloorv4hf, 2, v4hi)
-+ VAR1 (UNOP, lfloorv8hf, 2, v8hi)
- VAR1 (UNOP, lfloorv2sf, 2, v2si)
- VAR1 (UNOP, lfloorv4sf, 2, v4si)
- VAR1 (UNOP, lfloorv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOP, lfloorhf, 2)
-
-+ VAR1 (UNOPUS, lflooruv4hf, 2, v4hi)
-+ VAR1 (UNOPUS, lflooruv8hf, 2, v8hi)
- VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
- VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
- VAR1 (UNOPUS, lflooruv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOPUS, lflooruhf, 2)
- VAR1 (UNOPUS, lfloorusf, 2, si)
- VAR1 (UNOPUS, lfloorudf, 2, di)
-
-+ VAR1 (UNOP, lfrintnv4hf, 2, v4hi)
-+ VAR1 (UNOP, lfrintnv8hf, 2, v8hi)
- VAR1 (UNOP, lfrintnv2sf, 2, v2si)
- VAR1 (UNOP, lfrintnv4sf, 2, v4si)
- VAR1 (UNOP, lfrintnv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOP, lfrintnhf, 2)
- VAR1 (UNOP, lfrintnsf, 2, si)
- VAR1 (UNOP, lfrintndf, 2, di)
-
-+ VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi)
-+ VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi)
- VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
- VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
- VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
-+ BUILTIN_GPI_I16 (UNOPUS, lfrintnuhf, 2)
- VAR1 (UNOPUS, lfrintnusf, 2, si)
- VAR1 (UNOPUS, lfrintnudf, 2, di)
-
- /* Implemented by <optab><fcvt_target><VDQF:mode>2. */
-+ VAR1 (UNOP, floatv4hi, 2, v4hf)
-+ VAR1 (UNOP, floatv8hi, 2, v8hf)
- VAR1 (UNOP, floatv2si, 2, v2sf)
- VAR1 (UNOP, floatv4si, 2, v4sf)
- VAR1 (UNOP, floatv2di, 2, v2df)
-
-+ VAR1 (UNOP, floatunsv4hi, 2, v4hf)
-+ VAR1 (UNOP, floatunsv8hi, 2, v8hf)
- VAR1 (UNOP, floatunsv2si, 2, v2sf)
- VAR1 (UNOP, floatunsv4si, 2, v4sf)
- VAR1 (UNOP, floatunsv2di, 2, v2df)
-@@ -352,19 +403,19 @@
-
- /* Implemented by
- aarch64_frecp<FRECP:frecp_suffix><mode>. */
-- BUILTIN_GPF (UNOP, frecpe, 0)
-- BUILTIN_GPF (BINOP, frecps, 0)
-- BUILTIN_GPF (UNOP, frecpx, 0)
-+ BUILTIN_GPF_F16 (UNOP, frecpe, 0)
-+ BUILTIN_GPF_F16 (UNOP, frecpx, 0)
-
- BUILTIN_VDQ_SI (UNOP, urecpe, 0)
-
-- BUILTIN_VDQF (UNOP, frecpe, 0)
-- BUILTIN_VDQF (BINOP, frecps, 0)
-+ BUILTIN_VHSDF (UNOP, frecpe, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP, frecps, 0)
-
- /* Implemented by a mixture of abs2 patterns. Note the DImode builtin is
- only ever used for the int64x1_t intrinsic, there is no scalar version. */
- BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
-- BUILTIN_VDQF (UNOP, abs, 2)
-+ BUILTIN_VHSDF (UNOP, abs, 2)
-+ VAR1 (UNOP, abs, 2, hf)
-
- BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
- VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
-@@ -376,15 +427,22 @@
-
- /* Implemented by aarch64_ld1<VALL_F16:mode>. */
- BUILTIN_VALL_F16 (LOAD1, ld1, 0)
-+ VAR1(STORE1P, ld1, 0, v2di)
-
- /* Implemented by aarch64_st1<VALL_F16:mode>. */
- BUILTIN_VALL_F16 (STORE1, st1, 0)
-+ VAR1(STORE1P, st1, 0, v2di)
-
- /* Implemented by fma<mode>4. */
-- BUILTIN_VDQF (TERNOP, fma, 4)
-+ BUILTIN_VHSDF (TERNOP, fma, 4)
-+ VAR1 (TERNOP, fma, 4, hf)
-+ /* Implemented by fnma<mode>4. */
-+ BUILTIN_VHSDF (TERNOP, fnma, 4)
-+ VAR1 (TERNOP, fnma, 4, hf)
-
- /* Implemented by aarch64_simd_bsl<mode>. */
- BUILTIN_VDQQH (BSL_P, simd_bsl, 0)
-+ VAR2 (BSL_P, simd_bsl,0, di, v2di)
- BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0)
- BUILTIN_VALLDIF (BSL_S, simd_bsl, 0)
-
-@@ -436,7 +494,7 @@
- VAR1 (TERNOP, qtbx4, 0, v8qi)
- VAR1 (TERNOP, qtbx4, 0, v16qi)
-
-- /* Builtins for ARMv8.1 Adv.SIMD instructions. */
-+ /* Builtins for ARMv8.1-A Adv.SIMD instructions. */
-
- /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>. */
- BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0)
-@@ -449,3 +507,60 @@
- /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>. */
- BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
- BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
-+
-+ /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3. */
-+ BUILTIN_VSDQ_HSDI (SHIFTIMM, scvtf, 3)
-+ BUILTIN_VSDQ_HSDI (FCVTIMM_SUS, ucvtf, 3)
-+ BUILTIN_VHSDF_HSDF (SHIFTIMM, fcvtzs, 3)
-+ BUILTIN_VHSDF_HSDF (SHIFTIMM_USS, fcvtzu, 3)
-+ VAR1 (SHIFTIMM, scvtfsi, 3, hf)
-+ VAR1 (SHIFTIMM, scvtfdi, 3, hf)
-+ VAR1 (FCVTIMM_SUS, ucvtfsi, 3, hf)
-+ VAR1 (FCVTIMM_SUS, ucvtfdi, 3, hf)
-+ BUILTIN_GPI (SHIFTIMM, fcvtzshf, 3)
-+ BUILTIN_GPI (SHIFTIMM_USS, fcvtzuhf, 3)
-+
-+ /* Implemented by aarch64_rsqrte<mode>. */
-+ BUILTIN_VHSDF_HSDF (UNOP, rsqrte, 0)
-+
-+ /* Implemented by aarch64_rsqrts<mode>. */
-+ BUILTIN_VHSDF_HSDF (BINOP, rsqrts, 0)
-+
-+ /* Implemented by fabd<mode>3. */
-+ BUILTIN_VHSDF_HSDF (BINOP, fabd, 3)
-+
-+ /* Implemented by aarch64_faddp<mode>. */
-+ BUILTIN_VHSDF (BINOP, faddp, 0)
-+
-+ /* Implemented by aarch64_cm<optab><mode>. */
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, cmeq, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, cmge, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, cmgt, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, cmle, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, cmlt, 0)
-+
-+ /* Implemented by neg<mode>2. */
-+ BUILTIN_VHSDF_HSDF (UNOP, neg, 2)
-+
-+ /* Implemented by aarch64_fac<optab><mode>. */
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0)
-+ BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0)
-+
-+ /* Implemented by sqrt<mode>2. */
-+ VAR1 (UNOP, sqrt, 2, hf)
-+
-+ /* Implemented by <optab><mode>hf2. */
-+ VAR1 (UNOP, floatdi, 2, hf)
-+ VAR1 (UNOP, floatsi, 2, hf)
-+ VAR1 (UNOP, floathi, 2, hf)
-+ VAR1 (UNOPUS, floatunsdi, 2, hf)
-+ VAR1 (UNOPUS, floatunssi, 2, hf)
-+ VAR1 (UNOPUS, floatunshi, 2, hf)
-+ BUILTIN_GPI_I16 (UNOP, fix_trunchf, 2)
-+ BUILTIN_GPI (UNOP, fix_truncsf, 2)
-+ BUILTIN_GPI (UNOP, fix_truncdf, 2)
-+ BUILTIN_GPI_I16 (UNOPUS, fixuns_trunchf, 2)
-+ BUILTIN_GPI (UNOPUS, fixuns_truncsf, 2)
-+ BUILTIN_GPI (UNOPUS, fixuns_truncdf, 2)
-\ No newline at end of file
---- a/src/gcc/config/aarch64/aarch64-simd.md
-+++ b/src/gcc/config/aarch64/aarch64-simd.md
-@@ -351,7 +351,7 @@
- operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
- return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
- }
-- [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
-+ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
- )
-
- (define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
-@@ -371,33 +371,33 @@
- [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
- )
-
--(define_insn "*aarch64_mul3_elt_to_128df"
-- [(set (match_operand:V2DF 0 "register_operand" "=w")
-- (mult:V2DF
-- (vec_duplicate:V2DF
-- (match_operand:DF 2 "register_operand" "w"))
-- (match_operand:V2DF 1 "register_operand" "w")))]
-+(define_insn "*aarch64_mul3_elt_from_dup<mode>"
-+ [(set (match_operand:VMUL 0 "register_operand" "=w")
-+ (mult:VMUL
-+ (vec_duplicate:VMUL
-+ (match_operand:<VEL> 1 "register_operand" "<h_con>"))
-+ (match_operand:VMUL 2 "register_operand" "w")))]
- "TARGET_SIMD"
-- "fmul\\t%0.2d, %1.2d, %2.d[0]"
-- [(set_attr "type" "neon_fp_mul_d_scalar_q")]
-+ "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
-+ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
- )
-
--(define_insn "aarch64_rsqrte_<mode>2"
-- [(set (match_operand:VALLF 0 "register_operand" "=w")
-- (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
-+(define_insn "aarch64_rsqrte<mode>"
-+ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")]
- UNSPEC_RSQRTE))]
- "TARGET_SIMD"
- "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
-- [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
-+ [(set_attr "type" "neon_fp_rsqrte_<stype><q>")])
-
--(define_insn "aarch64_rsqrts_<mode>3"
-- [(set (match_operand:VALLF 0 "register_operand" "=w")
-- (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
-- (match_operand:VALLF 2 "register_operand" "w")]
-- UNSPEC_RSQRTS))]
-+(define_insn "aarch64_rsqrts<mode>"
-+ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF_HSDF 2 "register_operand" "w")]
-+ UNSPEC_RSQRTS))]
- "TARGET_SIMD"
- "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
-- [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
-+ [(set_attr "type" "neon_fp_rsqrts_<stype><q>")])
-
- (define_expand "rsqrt<mode>2"
- [(set (match_operand:VALLF 0 "register_operand" "=w")
-@@ -405,7 +405,7 @@
- UNSPEC_RSQRT))]
- "TARGET_SIMD"
- {
-- aarch64_emit_approx_rsqrt (operands[0], operands[1]);
-+ aarch64_emit_approx_sqrt (operands[0], operands[1], true);
- DONE;
- })
-
-@@ -474,24 +474,15 @@
- [(set_attr "type" "neon_arith_acc<q>")]
- )
-
--(define_insn "fabd<mode>_3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (abs:VDQF (minus:VDQF
-- (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w"))))]
-- "TARGET_SIMD"
-- "fabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
--)
--
--(define_insn "*fabd_scalar<mode>3"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (abs:GPF (minus:GPF
-- (match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w"))))]
-+(define_insn "fabd<mode>3"
-+ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
-+ (abs:VHSDF_HSDF
-+ (minus:VHSDF_HSDF
-+ (match_operand:VHSDF_HSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF_HSDF 2 "register_operand" "w"))))]
- "TARGET_SIMD"
-- "fabd\t%<s>0, %<s>1, %<s>2"
-- [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
-+ "fabd\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
-+ [(set_attr "type" "neon_fp_abd_<stype><q>")]
- )
-
- (define_insn "and<mode>3"
-@@ -555,6 +546,49 @@
- [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_1reg<q>")]
- )
-
-+(define_insn "*aarch64_simd_vec_copy_lane<mode>"
-+ [(set (match_operand:VALL 0 "register_operand" "=w")
-+ (vec_merge:VALL
-+ (vec_duplicate:VALL
-+ (vec_select:<VEL>
-+ (match_operand:VALL 3 "register_operand" "w")
-+ (parallel
-+ [(match_operand:SI 4 "immediate_operand" "i")])))
-+ (match_operand:VALL 1 "register_operand" "0")
-+ (match_operand:SI 2 "immediate_operand" "i")))]
-+ "TARGET_SIMD"
-+ {
-+ int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
-+ operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
-+ operands[4] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[4])));
-+
-+ return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
-+ }
-+ [(set_attr "type" "neon_ins<q>")]
-+)
-+
-+(define_insn "*aarch64_simd_vec_copy_lane_<vswap_width_name><mode>"
-+ [(set (match_operand:VALL 0 "register_operand" "=w")
-+ (vec_merge:VALL
-+ (vec_duplicate:VALL
-+ (vec_select:<VEL>
-+ (match_operand:<VSWAP_WIDTH> 3 "register_operand" "w")
-+ (parallel
-+ [(match_operand:SI 4 "immediate_operand" "i")])))
-+ (match_operand:VALL 1 "register_operand" "0")
-+ (match_operand:SI 2 "immediate_operand" "i")))]
-+ "TARGET_SIMD"
-+ {
-+ int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
-+ operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
-+ operands[4] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-+ INTVAL (operands[4])));
-+
-+ return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
-+ }
-+ [(set_attr "type" "neon_ins<q>")]
-+)
-+
- (define_insn "aarch64_simd_lshr<mode>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
- (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-@@ -1071,10 +1105,10 @@
-
- ;; Pairwise FP Max/Min operations.
- (define_insn "aarch64_<maxmin_uns>p<mode>"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")]
-- FMAXMINV))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")]
-+ FMAXMINV))]
- "TARGET_SIMD"
- "<maxmin_uns_op>p\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
- [(set_attr "type" "neon_minmax<q>")]
-@@ -1483,65 +1517,77 @@
- ;; FP arithmetic operations.
-
- (define_insn "add<mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (plus:VDQF (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")))]
- "TARGET_SIMD"
- "fadd\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_addsub_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_addsub_<stype><q>")]
- )
-
- (define_insn "sub<mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (minus:VDQF (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (minus:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")))]
- "TARGET_SIMD"
- "fsub\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_addsub_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_addsub_<stype><q>")]
- )
-
- (define_insn "mul<mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (mult:VDQF (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (mult:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")))]
- "TARGET_SIMD"
- "fmul\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_mul_<stype><q>")]
- )
-
--(define_insn "div<mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (div:VDQF (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")))]
-+(define_expand "div<mode>3"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")))]
-+ "TARGET_SIMD"
-+{
-+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
-+ DONE;
-+
-+ operands[1] = force_reg (<MODE>mode, operands[1]);
-+})
-+
-+(define_insn "*div<mode>3"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")))]
- "TARGET_SIMD"
- "fdiv\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_div_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_div_<stype><q>")]
- )
-
- (define_insn "neg<mode>2"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (neg:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "fneg\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_neg_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_neg_<stype><q>")]
- )
-
- (define_insn "abs<mode>2"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (abs:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "fabs\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_abs_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_abs_<stype><q>")]
- )
-
- (define_insn "fma<mode>4"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (fma:VDQF (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")
-- (match_operand:VDQF 3 "register_operand" "0")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")
-+ (match_operand:VHSDF 3 "register_operand" "0")))]
- "TARGET_SIMD"
- "fmla\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_mla_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_mla_<stype><q>")]
- )
-
- (define_insn "*aarch64_fma4_elt<mode>"
-@@ -1579,16 +1625,16 @@
- [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
- )
-
--(define_insn "*aarch64_fma4_elt_to_128df"
-- [(set (match_operand:V2DF 0 "register_operand" "=w")
-- (fma:V2DF
-- (vec_duplicate:V2DF
-- (match_operand:DF 1 "register_operand" "w"))
-- (match_operand:V2DF 2 "register_operand" "w")
-- (match_operand:V2DF 3 "register_operand" "0")))]
-+(define_insn "*aarch64_fma4_elt_from_dup<mode>"
-+ [(set (match_operand:VMUL 0 "register_operand" "=w")
-+ (fma:VMUL
-+ (vec_duplicate:VMUL
-+ (match_operand:<VEL> 1 "register_operand" "w"))
-+ (match_operand:VMUL 2 "register_operand" "w")
-+ (match_operand:VMUL 3 "register_operand" "0")))]
- "TARGET_SIMD"
-- "fmla\\t%0.2d, %2.2d, %1.2d[0]"
-- [(set_attr "type" "neon_fp_mla_d_scalar_q")]
-+ "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-+ [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
- )
-
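Generalising *aarch64_fma4_elt_to_128df into *aarch64_fma4_elt_from_dup<mode> lets fmla with a broadcast scalar cover every VMUL mode rather than just V2DF. A sketch using the vfmaq_n_f32 intrinsic from arm_neon.h, which matches exactly this shape:

    #include <arm_neon.h>

    /* fmla v0.4s, v1.4s, v2.s[0]: multiply every lane of B by the
       scalar N and accumulate into ACC.  */
    float32x4_t
    axpy4 (float32x4_t acc, float32x4_t b, float n)
    {
      return vfmaq_n_f32 (acc, b, n);
    }
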
- (define_insn "*aarch64_fma4_elt_to_64v2df"
-@@ -1608,15 +1654,15 @@
- )
-
- (define_insn "fnma<mode>4"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (fma:VDQF
-- (match_operand:VDQF 1 "register_operand" "w")
-- (neg:VDQF
-- (match_operand:VDQF 2 "register_operand" "w"))
-- (match_operand:VDQF 3 "register_operand" "0")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (fma:VHSDF
-+ (match_operand:VHSDF 1 "register_operand" "w")
-+ (neg:VHSDF
-+ (match_operand:VHSDF 2 "register_operand" "w"))
-+ (match_operand:VHSDF 3 "register_operand" "0")))]
- "TARGET_SIMD"
-- "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_mla_<Vetype><q>")]
-+ "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-+ [(set_attr "type" "neon_fp_mla_<stype><q>")]
- )
-
- (define_insn "*aarch64_fnma4_elt<mode>"
-@@ -1656,17 +1702,17 @@
- [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
- )
-
--(define_insn "*aarch64_fnma4_elt_to_128df"
-- [(set (match_operand:V2DF 0 "register_operand" "=w")
-- (fma:V2DF
-- (neg:V2DF
-- (match_operand:V2DF 2 "register_operand" "w"))
-- (vec_duplicate:V2DF
-- (match_operand:DF 1 "register_operand" "w"))
-- (match_operand:V2DF 3 "register_operand" "0")))]
-+(define_insn "*aarch64_fnma4_elt_from_dup<mode>"
-+ [(set (match_operand:VMUL 0 "register_operand" "=w")
-+ (fma:VMUL
-+ (neg:VMUL
-+ (match_operand:VMUL 2 "register_operand" "w"))
-+ (vec_duplicate:VMUL
-+ (match_operand:<VEL> 1 "register_operand" "w"))
-+ (match_operand:VMUL 3 "register_operand" "0")))]
- "TARGET_SIMD"
-- "fmls\\t%0.2d, %2.2d, %1.2d[0]"
-- [(set_attr "type" "neon_fp_mla_d_scalar_q")]
-+ "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-+ [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
- )
-
- (define_insn "*aarch64_fnma4_elt_to_64v2df"
-@@ -1689,24 +1735,50 @@
- ;; Vector versions of the floating-point frint patterns.
- ;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn.
- (define_insn "<frint_pattern><mode>2"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-- FRINT))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
-+ FRINT))]
- "TARGET_SIMD"
- "frint<frint_suffix>\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_round_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_round_<stype><q>")]
- )
-
- ;; Vector versions of the fcvt standard patterns.
- ;; Expands to lbtrunc, lround, lceil, lfloor
--(define_insn "l<fcvt_pattern><su_optab><VDQF:mode><fcvt_target>2"
-+(define_insn "l<fcvt_pattern><su_optab><VHSDF:mode><fcvt_target>2"
- [(set (match_operand:<FCVT_TARGET> 0 "register_operand" "=w")
- (FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-- [(match_operand:VDQF 1 "register_operand" "w")]
-+ [(match_operand:VHSDF 1 "register_operand" "w")]
- FCVT)))]
- "TARGET_SIMD"
- "fcvt<frint_suffix><su>\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_to_int_<stype><q>")]
-+)
-+
-+;; HF Scalar variants of related SIMD instructions.
-+(define_insn "l<fcvt_pattern><su_optab>hfhi2"
-+ [(set (match_operand:HI 0 "register_operand" "=w")
-+ (FIXUORS:HI (unspec:HF [(match_operand:HF 1 "register_operand" "w")]
-+ FCVT)))]
-+ "TARGET_SIMD_F16INST"
-+ "fcvt<frint_suffix><su>\t%h0, %h1"
-+ [(set_attr "type" "neon_fp_to_int_s")]
-+)
-+
-+(define_insn "<optab>_trunchfhi2"
-+ [(set (match_operand:HI 0 "register_operand" "=w")
-+ (FIXUORS:HI (match_operand:HF 1 "register_operand" "w")))]
-+ "TARGET_SIMD_F16INST"
-+ "fcvtz<su>\t%h0, %h1"
-+ [(set_attr "type" "neon_fp_to_int_s")]
-+)
-+
-+(define_insn "<optab>hihf2"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (FLOATUORS:HF (match_operand:HI 1 "register_operand" "w")))]
-+ "TARGET_SIMD_F16INST"
-+ "<su_optab>cvtf\t%h0, %h1"
-+ [(set_attr "type" "neon_int_to_fp_s")]
- )
-
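The three TARGET_SIMD_F16INST patterns above add scalar half-precision conversions alongside the vector forms. A hedged example; -march=armv8.2-a+fp16 is assumed here as the option that enables TARGET_SIMD_F16INST:

    /* With the ARMv8.2-A FP16 extension this cast can use the scalar
       fcvtzs h-form added above instead of first widening to single
       precision.  */
    short
    hf_to_int (__fp16 x)
    {
      return (short) x;
    }
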
- (define_insn "*aarch64_fcvt<su_optab><VDQF:mode><fcvt_target>2_mult"
-@@ -1729,36 +1801,36 @@
- [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
- )
-
--(define_expand "<optab><VDQF:mode><fcvt_target>2"
-+(define_expand "<optab><VHSDF:mode><fcvt_target>2"
- [(set (match_operand:<FCVT_TARGET> 0 "register_operand")
- (FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-- [(match_operand:VDQF 1 "register_operand")]
-- UNSPEC_FRINTZ)))]
-+ [(match_operand:VHSDF 1 "register_operand")]
-+ UNSPEC_FRINTZ)))]
- "TARGET_SIMD"
- {})
-
--(define_expand "<fix_trunc_optab><VDQF:mode><fcvt_target>2"
-+(define_expand "<fix_trunc_optab><VHSDF:mode><fcvt_target>2"
- [(set (match_operand:<FCVT_TARGET> 0 "register_operand")
- (FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-- [(match_operand:VDQF 1 "register_operand")]
-- UNSPEC_FRINTZ)))]
-+ [(match_operand:VHSDF 1 "register_operand")]
-+ UNSPEC_FRINTZ)))]
- "TARGET_SIMD"
- {})
-
--(define_expand "ftrunc<VDQF:mode>2"
-- [(set (match_operand:VDQF 0 "register_operand")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand")]
-- UNSPEC_FRINTZ))]
-+(define_expand "ftrunc<VHSDF:mode>2"
-+ [(set (match_operand:VHSDF 0 "register_operand")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
-+ UNSPEC_FRINTZ))]
- "TARGET_SIMD"
- {})
-
--(define_insn "<optab><fcvt_target><VDQF:mode>2"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (FLOATUORS:VDQF
-+(define_insn "<optab><fcvt_target><VHSDF:mode>2"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (FLOATUORS:VHSDF
- (match_operand:<FCVT_TARGET> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "<su_optab>cvtf\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_int_to_fp_<Vetype><q>")]
-+ [(set_attr "type" "neon_int_to_fp_<stype><q>")]
- )
-
- ;; Conversions between vectors of floats and doubles.
-@@ -1778,6 +1850,30 @@
- [(set_attr "type" "neon_fp_cvt_widen_s")]
- )
-
-+;; Convert between fixed-point and floating-point (vector modes)
-+
-+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><VHSDF:mode>3"
-+ [(set (match_operand:<VHSDF:FCVT_TARGET> 0 "register_operand" "=w")
-+ (unspec:<VHSDF:FCVT_TARGET>
-+ [(match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ FCVT_F2FIXED))]
-+ "TARGET_SIMD"
-+ "<FCVT_F2FIXED:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
-+ [(set_attr "type" "neon_fp_to_int_<VHSDF:stype><q>")]
-+)
-+
-+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><VDQ_HSDI:mode>3"
-+ [(set (match_operand:<VDQ_HSDI:FCVT_TARGET> 0 "register_operand" "=w")
-+ (unspec:<VDQ_HSDI:FCVT_TARGET>
-+ [(match_operand:VDQ_HSDI 1 "register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ FCVT_FIXED2F))]
-+ "TARGET_SIMD"
-+ "<FCVT_FIXED2F:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
-+ [(set_attr "type" "neon_int_to_fp_<VDQ_HSDI:stype><q>")]
-+)
-+
- ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns
- ;; is inconsistent with vector ordering elsewhere in the compiler, in that
- ;; the meaning of HI and LO changes depending on the target endianness.
-@@ -1934,33 +2030,25 @@
- ;; NaNs.
-
- (define_insn "<su><maxmin><mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (FMAXMIN:VDQF (match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (FMAXMIN:VHSDF (match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")))]
- "TARGET_SIMD"
- "f<maxmin>nm\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_minmax_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_minmax_<stype><q>")]
- )
-
-+;; Vector forms for fmax, fmin, fmaxnm, fminnm.
-+;; fmaxnm and fminnm are used for the fmax<mode>3 standard pattern names,
-+;; which implement the IEEE fmax ()/fmin () functions.
- (define_insn "<maxmin_uns><mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")]
-- FMAXMIN_UNS))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")]
-+ FMAXMIN_UNS))]
- "TARGET_SIMD"
- "<maxmin_uns_op>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_minmax_<Vetype><q>")]
--)
--
--;; Auto-vectorized forms for the IEEE-754 fmax()/fmin() functions
--(define_insn "<fmaxmin><mode>3"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
-- (match_operand:VDQF 2 "register_operand" "w")]
-- FMAXMIN))]
-- "TARGET_SIMD"
-- "<fmaxmin_op>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-- [(set_attr "type" "neon_fp_minmax_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_minmax_<stype><q>")]
- )
-
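Because fmaxnm/fminnm implement the IEEE 754 maxNum/minNum semantics, calls to the C fmax/fmin functions can be vectorized to them without -ffast-math. A small candidate loop:

    #include <math.h>

    /* Lane-wise IEEE max: a candidate for fmaxnm.4s via the pattern
       above.  */
    void
    vmax4 (float *restrict r, const float *restrict a, const float *restrict b)
    {
      for (int i = 0; i < 4; i++)
        r[i] = fmaxf (a[i], b[i]);
    }
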
- ;; 'across lanes' add.
-@@ -1979,17 +2067,14 @@
- }
- )
-
--(define_expand "reduc_plus_scal_<mode>"
-- [(match_operand:<VEL> 0 "register_operand" "=w")
-- (match_operand:V2F 1 "register_operand" "w")]
-- "TARGET_SIMD"
-- {
-- rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
-- rtx scratch = gen_reg_rtx (<MODE>mode);
-- emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
-- emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
-- DONE;
-- }
-+(define_insn "aarch64_faddp<mode>"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF 2 "register_operand" "w")]
-+ UNSPEC_FADDV))]
-+ "TARGET_SIMD"
-+ "faddp\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-+ [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
- )
-
- (define_insn "aarch64_reduc_plus_internal<mode>"
-@@ -2010,24 +2095,15 @@
- [(set_attr "type" "neon_reduc_add")]
- )
-
--(define_insn "aarch64_reduc_plus_internal<mode>"
-- [(set (match_operand:V2F 0 "register_operand" "=w")
-- (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
-+(define_insn "reduc_plus_scal_<mode>"
-+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
-+ (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
- UNSPEC_FADDV))]
- "TARGET_SIMD"
- "faddp\\t%<Vetype>0, %1.<Vtype>"
- [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
- )
-
--(define_insn "aarch64_addpv4sf"
-- [(set (match_operand:V4SF 0 "register_operand" "=w")
-- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
-- UNSPEC_FADDV))]
-- "TARGET_SIMD"
-- "faddp\\t%0.4s, %1.4s, %1.4s"
-- [(set_attr "type" "neon_fp_reduc_add_s_q")]
--)
--
- (define_expand "reduc_plus_scal_v4sf"
- [(set (match_operand:SF 0 "register_operand")
- (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
-@@ -2036,8 +2112,8 @@
- {
- rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
- rtx scratch = gen_reg_rtx (V4SFmode);
-- emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
-- emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
-+ emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
-+ emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
- emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
- DONE;
- })
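The rewritten expansion sums a V4SF with two faddp pairwise additions and a lane extract. The same shape can be written directly with intrinsics (vpaddq_f32 and vgetq_lane_f32 exist in the AArch64 arm_neon.h):

    #include <arm_neon.h>

    /* Mirror of reduc_plus_scal_v4sf above: faddp, faddp, take lane 0.  */
    float
    hsum4 (float32x4_t v)
    {
      float32x4_t t = vpaddq_f32 (v, v);   /* pairwise add */
      t = vpaddq_f32 (t, t);               /* pairwise add again */
      return vgetq_lane_f32 (t, 0);
    }
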
-@@ -2072,8 +2148,8 @@
- ;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code. (This is FP smax/smin).
- (define_expand "reduc_<maxmin_uns>_scal_<mode>"
- [(match_operand:<VEL> 0 "register_operand")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand")]
-- FMAXMINV)]
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
-+ FMAXMINV)]
- "TARGET_SIMD"
- {
- rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
-@@ -2120,12 +2196,12 @@
- )
-
- (define_insn "aarch64_reduc_<maxmin_uns>_internal<mode>"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-- FMAXMINV))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
-+ FMAXMINV))]
- "TARGET_SIMD"
- "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_reduc_minmax_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_reduc_minmax_<stype><q>")]
- )
-
- ;; aarch64_simd_bsl may compile to any of bsl/bif/bit depending on register
-@@ -2635,7 +2711,7 @@
- (define_insn "*aarch64_combinez<mode>"
- [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
- (vec_concat:<VDBL>
-- (match_operand:VD_BHSI 1 "general_operand" "w,r,m")
-+ (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")
- (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))]
- "TARGET_SIMD && !BYTES_BIG_ENDIAN"
- "@
-@@ -2651,7 +2727,7 @@
- [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
- (vec_concat:<VDBL>
- (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")
-- (match_operand:VD_BHSI 1 "general_operand" "w,r,m")))]
-+ (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")))]
- "TARGET_SIMD && BYTES_BIG_ENDIAN"
- "@
- mov\\t%0.8b, %1.8b
-@@ -2994,13 +3070,14 @@
- ;; fmulx.
-
- (define_insn "aarch64_fmulx<mode>"
-- [(set (match_operand:VALLF 0 "register_operand" "=w")
-- (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
-- (match_operand:VALLF 2 "register_operand" "w")]
-- UNSPEC_FMULX))]
-+ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF_HSDF
-+ [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF_HSDF 2 "register_operand" "w")]
-+ UNSPEC_FMULX))]
- "TARGET_SIMD"
- "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
-- [(set_attr "type" "neon_fp_mul_<Vetype>")]
-+ [(set_attr "type" "neon_fp_mul_<stype>")]
- )
-
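Widening aarch64_fmulx<mode> from VALLF to VHSDF_HSDF makes one pattern serve the new half-precision modes as well. fmulx is exposed through the vmulx* intrinsics; for example:

    #include <arm_neon.h>

    /* fmulx v0.4s, v1.4s, v2.4s: like fmul except that 0 * inf
       yields +/-2.0 instead of NaN.  */
    float32x4_t
    mulx4 (float32x4_t a, float32x4_t b)
    {
      return vmulxq_f32 (a, b);
    }
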
- ;; vmulxq_lane_f32, and vmulx_laneq_f32
-@@ -3042,20 +3119,18 @@
- [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
- )
-
--;; vmulxq_lane_f64
-+;; vmulxq_lane
-
--(define_insn "*aarch64_mulx_elt_to_64v2df"
-- [(set (match_operand:V2DF 0 "register_operand" "=w")
-- (unspec:V2DF
-- [(match_operand:V2DF 1 "register_operand" "w")
-- (vec_duplicate:V2DF
-- (match_operand:DF 2 "register_operand" "w"))]
-+(define_insn "*aarch64_mulx_elt_from_dup<mode>"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF
-+ [(match_operand:VHSDF 1 "register_operand" "w")
-+ (vec_duplicate:VHSDF
-+ (match_operand:<VEL> 2 "register_operand" "w"))]
- UNSPEC_FMULX))]
- "TARGET_SIMD"
-- {
-- return "fmulx\t%0.2d, %1.2d, %2.d[0]";
-- }
-- [(set_attr "type" "neon_fp_mul_d_scalar_q")]
-+ "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
-+ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
- )
-
- ;; vmulxs_lane_f32, vmulxs_laneq_f32
-@@ -3937,15 +4012,12 @@
- "aarch64_simd_shift_imm_bitsize_<ve_mode>" "i")]
- VSHLL))]
- "TARGET_SIMD"
-- "*
-- int bit_width = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
-- if (INTVAL (operands[2]) == bit_width)
- {
-- return \"shll\\t%0.<Vwtype>, %1.<Vtype>, %2\";
-+ if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<MODE>mode))
-+ return "shll\\t%0.<Vwtype>, %1.<Vtype>, %2";
-+ else
-+ return "<sur>shll\\t%0.<Vwtype>, %1.<Vtype>, %2";
- }
-- else {
-- return \"<sur>shll\\t%0.<Vwtype>, %1.<Vtype>, %2\";
-- }"
- [(set_attr "type" "neon_shift_imm_long")]
- )
-
-@@ -3957,15 +4029,12 @@
- (match_operand:SI 2 "immediate_operand" "i")]
- VSHLL))]
- "TARGET_SIMD"
-- "*
-- int bit_width = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
-- if (INTVAL (operands[2]) == bit_width)
- {
-- return \"shll2\\t%0.<Vwtype>, %1.<Vtype>, %2\";
-+ if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<MODE>mode))
-+ return "shll2\\t%0.<Vwtype>, %1.<Vtype>, %2";
-+ else
-+ return "<sur>shll2\\t%0.<Vwtype>, %1.<Vtype>, %2";
- }
-- else {
-- return \"<sur>shll2\\t%0.<Vwtype>, %1.<Vtype>, %2\";
-- }"
- [(set_attr "type" "neon_shift_imm_long")]
- )
-
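Both rewrites above replace the old escaped "*..." template with a braced C body; the behaviour is unchanged: a shift count equal to the element width selects the plain SHLL form, anything smaller the s/u/r-prefixed one. At the intrinsic level:

    #include <arm_neon.h>

    /* Shifting by the full element width (8) selects the shll
       template in the pattern above.  */
    int16x8_t
    widen_shl8 (int8x8_t v)
    {
      return vshll_n_s8 (v, 8);
    }
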
-@@ -4246,30 +4315,32 @@
- [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w")
- (neg:<V_cmp_result>
- (COMPARISONS:<V_cmp_result>
-- (match_operand:VALLF 1 "register_operand" "w,w")
-- (match_operand:VALLF 2 "aarch64_simd_reg_or_zero" "w,YDz")
-+ (match_operand:VHSDF_HSDF 1 "register_operand" "w,w")
-+ (match_operand:VHSDF_HSDF 2 "aarch64_simd_reg_or_zero" "w,YDz")
- )))]
- "TARGET_SIMD"
- "@
- fcm<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>
- fcm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0"
-- [(set_attr "type" "neon_fp_compare_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_compare_<stype><q>")]
- )
-
- ;; fac(ge|gt)
- ;; Note we can also handle what would be fac(le|lt) by
- ;; generating fac(ge|gt).
-
--(define_insn "*aarch64_fac<optab><mode>"
-+(define_insn "aarch64_fac<optab><mode>"
- [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
- (neg:<V_cmp_result>
- (FAC_COMPARISONS:<V_cmp_result>
-- (abs:VALLF (match_operand:VALLF 1 "register_operand" "w"))
-- (abs:VALLF (match_operand:VALLF 2 "register_operand" "w"))
-+ (abs:VHSDF_HSDF
-+ (match_operand:VHSDF_HSDF 1 "register_operand" "w"))
-+ (abs:VHSDF_HSDF
-+ (match_operand:VHSDF_HSDF 2 "register_operand" "w"))
- )))]
- "TARGET_SIMD"
- "fac<n_optab>\t%<v>0<Vmtype>, %<v><cmp_1><Vmtype>, %<v><cmp_2><Vmtype>"
-- [(set_attr "type" "neon_fp_compare_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_compare_<stype><q>")]
- )
-
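Renaming *aarch64_fac<optab><mode> to aarch64_fac<optab><mode> makes the absolute-value compares directly generable from the backend. They correspond to the vcage/vcagt intrinsic family:

    #include <arm_neon.h>

    /* facgt v0.4s, v1.4s, v2.4s: lane-wise |a| > |b| as an
       all-ones/all-zeros mask.  */
    uint32x4_t
    abs_gt4 (float32x4_t a, float32x4_t b)
    {
      return vcagtq_f32 (a, b);
    }
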
- ;; addp
-@@ -4297,12 +4368,21 @@
-
- ;; sqrt
-
--(define_insn "sqrt<mode>2"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
-+(define_expand "sqrt<mode>2"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
-+ "TARGET_SIMD"
-+{
-+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
-+ DONE;
-+})
-+
-+(define_insn "*sqrt<mode>2"
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "fsqrt\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_sqrt_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_sqrt_<stype><q>")]
- )
-
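As with division, sqrt<mode>2 becomes an expander gated on aarch64_emit_approx_sqrt, with fsqrt kept in the *sqrt<mode>2 insn. A sketch, assuming the companion -mlow-precision-sqrt option (with -ffast-math):

    #include <math.h>

    /* Lowered to fsqrt by default, or to a frsqrte/frsqrts
       Newton-Raphson sequence when approximation is allowed.  */
    void
    vsqrt4 (float *restrict r, const float *restrict a)
    {
      for (int i = 0; i < 4; i++)
        r[i] = sqrtf (a[i]);
    }
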
- ;; Patterns for vector struct loads and stores.
-@@ -4652,7 +4732,7 @@
- ld1\\t{%S0.16b - %<Vendreg>0.16b}, %1"
- [(set_attr "type" "multiple,neon_store<nregs>_<nregs>reg_q,\
- neon_load<nregs>_<nregs>reg_q")
-- (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
-+ (set_attr "length" "<insn_count>,4,4")]
- )
-
- (define_insn "aarch64_be_ld1<mode>"
-@@ -4685,7 +4765,7 @@
- stp\\t%q1, %R1, %0
- ldp\\t%q0, %R0, %1"
- [(set_attr "type" "multiple,neon_stp_q,neon_ldp_q")
-- (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
-+ (set_attr "length" "8,4,4")]
- )
-
- (define_insn "*aarch64_be_movci"
-@@ -4696,7 +4776,7 @@
- || register_operand (operands[1], CImode))"
- "#"
- [(set_attr "type" "multiple")
-- (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
-+ (set_attr "length" "12,4,4")]
- )
-
- (define_insn "*aarch64_be_movxi"
-@@ -4707,7 +4787,7 @@
- || register_operand (operands[1], XImode))"
- "#"
- [(set_attr "type" "multiple")
-- (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)"))]
-+ (set_attr "length" "16,4,4")]
- )
-
- (define_split
-@@ -4787,7 +4867,7 @@
- DONE;
- })
-
--(define_insn "aarch64_ld2<mode>_dreg"
-+(define_insn "aarch64_ld2<mode>_dreg_le"
- [(set (match_operand:OI 0 "register_operand" "=w")
- (subreg:OI
- (vec_concat:<VRL2>
-@@ -4800,12 +4880,30 @@
- (unspec:VD [(match_dup 1)]
- UNSPEC_LD2)
- (vec_duplicate:VD (const_int 0)))) 0))]
-- "TARGET_SIMD"
-+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
- "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
- [(set_attr "type" "neon_load2_2reg<q>")]
- )
-
--(define_insn "aarch64_ld2<mode>_dreg"
-+(define_insn "aarch64_ld2<mode>_dreg_be"
-+ [(set (match_operand:OI 0 "register_operand" "=w")
-+ (subreg:OI
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD
-+ [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-+ UNSPEC_LD2))
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD [(match_dup 1)]
-+ UNSPEC_LD2))) 0))]
-+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
-+ "ld2\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
-+ [(set_attr "type" "neon_load2_2reg<q>")]
-+)
-+
-+(define_insn "aarch64_ld2<mode>_dreg_le"
- [(set (match_operand:OI 0 "register_operand" "=w")
- (subreg:OI
- (vec_concat:<VRL2>
-@@ -4818,12 +4916,30 @@
- (unspec:DX [(match_dup 1)]
- UNSPEC_LD2)
- (const_int 0))) 0))]
-- "TARGET_SIMD"
-+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
- "ld1\\t{%S0.1d - %T0.1d}, %1"
- [(set_attr "type" "neon_load1_2reg<q>")]
- )
-
--(define_insn "aarch64_ld3<mode>_dreg"
-+(define_insn "aarch64_ld2<mode>_dreg_be"
-+ [(set (match_operand:OI 0 "register_operand" "=w")
-+ (subreg:OI
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX
-+ [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-+ UNSPEC_LD2))
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX [(match_dup 1)]
-+ UNSPEC_LD2))) 0))]
-+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
-+ "ld1\\t{%S0.1d - %T0.1d}, %1"
-+ [(set_attr "type" "neon_load1_2reg<q>")]
-+)
-+
-+(define_insn "aarch64_ld3<mode>_dreg_le"
- [(set (match_operand:CI 0 "register_operand" "=w")
- (subreg:CI
- (vec_concat:<VRL3>
-@@ -4841,12 +4957,35 @@
- (unspec:VD [(match_dup 1)]
- UNSPEC_LD3)
- (vec_duplicate:VD (const_int 0)))) 0))]
-- "TARGET_SIMD"
-+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-+ "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
-+ [(set_attr "type" "neon_load3_3reg<q>")]
-+)
-+
-+(define_insn "aarch64_ld3<mode>_dreg_be"
-+ [(set (match_operand:CI 0 "register_operand" "=w")
-+ (subreg:CI
-+ (vec_concat:<VRL3>
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD
-+ [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-+ UNSPEC_LD3))
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD [(match_dup 1)]
-+ UNSPEC_LD3)))
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD [(match_dup 1)]
-+ UNSPEC_LD3))) 0))]
-+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
- "ld3\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
- [(set_attr "type" "neon_load3_3reg<q>")]
- )
-
--(define_insn "aarch64_ld3<mode>_dreg"
-+(define_insn "aarch64_ld3<mode>_dreg_le"
- [(set (match_operand:CI 0 "register_operand" "=w")
- (subreg:CI
- (vec_concat:<VRL3>
-@@ -4864,12 +5003,35 @@
- (unspec:DX [(match_dup 1)]
- UNSPEC_LD3)
- (const_int 0))) 0))]
-- "TARGET_SIMD"
-+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
- "ld1\\t{%S0.1d - %U0.1d}, %1"
- [(set_attr "type" "neon_load1_3reg<q>")]
- )
-
--(define_insn "aarch64_ld4<mode>_dreg"
-+(define_insn "aarch64_ld3<mode>_dreg_be"
-+ [(set (match_operand:CI 0 "register_operand" "=w")
-+ (subreg:CI
-+ (vec_concat:<VRL3>
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX
-+ [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-+ UNSPEC_LD3))
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX [(match_dup 1)]
-+ UNSPEC_LD3)))
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX [(match_dup 1)]
-+ UNSPEC_LD3))) 0))]
-+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
-+ "ld1\\t{%S0.1d - %U0.1d}, %1"
-+ [(set_attr "type" "neon_load1_3reg<q>")]
-+)
-+
-+(define_insn "aarch64_ld4<mode>_dreg_le"
- [(set (match_operand:XI 0 "register_operand" "=w")
- (subreg:XI
- (vec_concat:<VRL4>
-@@ -4880,9 +5042,9 @@
- UNSPEC_LD4)
- (vec_duplicate:VD (const_int 0)))
- (vec_concat:<VDBL>
-- (unspec:VD [(match_dup 1)]
-+ (unspec:VD [(match_dup 1)]
- UNSPEC_LD4)
-- (vec_duplicate:VD (const_int 0))))
-+ (vec_duplicate:VD (const_int 0))))
- (vec_concat:<VRL2>
- (vec_concat:<VDBL>
- (unspec:VD [(match_dup 1)]
-@@ -4892,12 +5054,40 @@
- (unspec:VD [(match_dup 1)]
- UNSPEC_LD4)
- (vec_duplicate:VD (const_int 0))))) 0))]
-- "TARGET_SIMD"
-+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-+ "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
-+ [(set_attr "type" "neon_load4_4reg<q>")]
-+)
-+
-+(define_insn "aarch64_ld4<mode>_dreg_be"
-+ [(set (match_operand:XI 0 "register_operand" "=w")
-+ (subreg:XI
-+ (vec_concat:<VRL4>
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD
-+ [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-+ UNSPEC_LD4))
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD [(match_dup 1)]
-+ UNSPEC_LD4)))
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD [(match_dup 1)]
-+ UNSPEC_LD4))
-+ (vec_concat:<VDBL>
-+ (vec_duplicate:VD (const_int 0))
-+ (unspec:VD [(match_dup 1)]
-+ UNSPEC_LD4)))) 0))]
-+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
- "ld4\\t{%S0.<Vtype> - %V0.<Vtype>}, %1"
- [(set_attr "type" "neon_load4_4reg<q>")]
- )
-
--(define_insn "aarch64_ld4<mode>_dreg"
-+(define_insn "aarch64_ld4<mode>_dreg_le"
- [(set (match_operand:XI 0 "register_operand" "=w")
- (subreg:XI
- (vec_concat:<VRL4>
-@@ -4910,7 +5100,7 @@
- (vec_concat:<VDBL>
- (unspec:DX [(match_dup 1)]
- UNSPEC_LD4)
-- (const_int 0)))
-+ (const_int 0)))
- (vec_concat:<VRL2>
- (vec_concat:<VDBL>
- (unspec:DX [(match_dup 1)]
-@@ -4920,7 +5110,35 @@
- (unspec:DX [(match_dup 1)]
- UNSPEC_LD4)
- (const_int 0)))) 0))]
-- "TARGET_SIMD"
-+ "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-+ "ld1\\t{%S0.1d - %V0.1d}, %1"
-+ [(set_attr "type" "neon_load1_4reg<q>")]
-+)
-+
-+(define_insn "aarch64_ld4<mode>_dreg_be"
-+ [(set (match_operand:XI 0 "register_operand" "=w")
-+ (subreg:XI
-+ (vec_concat:<VRL4>
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX
-+ [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")]
-+ UNSPEC_LD4))
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX [(match_dup 1)]
-+ UNSPEC_LD4)))
-+ (vec_concat:<VRL2>
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX [(match_dup 1)]
-+ UNSPEC_LD4))
-+ (vec_concat:<VDBL>
-+ (const_int 0)
-+ (unspec:DX [(match_dup 1)]
-+ UNSPEC_LD4)))) 0))]
-+ "TARGET_SIMD && BYTES_BIG_ENDIAN"
- "ld1\\t{%S0.1d - %V0.1d}, %1"
- [(set_attr "type" "neon_load1_4reg<q>")]
- )
-@@ -4934,7 +5152,12 @@
- rtx mem = gen_rtx_MEM (BLKmode, operands[1]);
- set_mem_size (mem, <VSTRUCT:nregs> * 8);
-
-- emit_insn (gen_aarch64_ld<VSTRUCT:nregs><VDC:mode>_dreg (operands[0], mem));
-+ if (BYTES_BIG_ENDIAN)
-+ emit_insn (gen_aarch64_ld<VSTRUCT:nregs><VDC:mode>_dreg_be (operands[0],
-+ mem));
-+ else
-+ emit_insn (gen_aarch64_ld<VSTRUCT:nregs><VDC:mode>_dreg_le (operands[0],
-+ mem));
- DONE;
- })
-
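The expander now chooses the _le or _be variant at expand time instead of relying on one pattern for both byte orders; user code is unaffected. For instance, the vld2 intrinsics still map onto these D-register patterns:

    #include <arm_neon.h>

    /* ld2 {v0.2s, v1.2s}, [x0]: de-interleave four ints into two
       two-lane vectors.  */
    int32x2x2_t
    load_pairs (const int32_t *p)
    {
      return vld2_s32 (p);
    }
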
-@@ -5160,10 +5383,10 @@
- )
-
- (define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>"
-- [(set (match_operand:VALL 0 "register_operand" "=w")
-- (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
-- (match_operand:VALL 2 "register_operand" "w")]
-- PERMUTE))]
-+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-+ (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
-+ (match_operand:VALL_F16 2 "register_operand" "w")]
-+ PERMUTE))]
- "TARGET_SIMD"
- "<PERMUTE:perm_insn><PERMUTE:perm_hilo>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
- [(set_attr "type" "neon_permute<q>")]
-@@ -5171,11 +5394,11 @@
-
- ;; Note immediate (third) operand is lane index not byte index.
- (define_insn "aarch64_ext<mode>"
-- [(set (match_operand:VALL 0 "register_operand" "=w")
-- (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
-- (match_operand:VALL 2 "register_operand" "w")
-- (match_operand:SI 3 "immediate_operand" "i")]
-- UNSPEC_EXT))]
-+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-+ (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
-+ (match_operand:VALL_F16 2 "register_operand" "w")
-+ (match_operand:SI 3 "immediate_operand" "i")]
-+ UNSPEC_EXT))]
- "TARGET_SIMD"
- {
- operands[3] = GEN_INT (INTVAL (operands[3])
-@@ -5186,8 +5409,8 @@
- )
-
- (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
-- [(set (match_operand:VALL 0 "register_operand" "=w")
-- (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")]
-+ [(set (match_operand:VALL_F16 0 "register_operand" "=w")
-+ (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")]
- REVERSE))]
- "TARGET_SIMD"
- "rev<REVERSE:rev_op>\\t%0.<Vtype>, %1.<Vtype>"
-@@ -5354,31 +5577,32 @@
- )
-
- (define_insn "aarch64_frecpe<mode>"
-- [(set (match_operand:VDQF 0 "register_operand" "=w")
-- (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-- UNSPEC_FRECPE))]
-+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
-+ UNSPEC_FRECPE))]
- "TARGET_SIMD"
- "frecpe\\t%0.<Vtype>, %1.<Vtype>"
-- [(set_attr "type" "neon_fp_recpe_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_recpe_<stype><q>")]
- )
-
- (define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
-- FRECP))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")]
-+ FRECP))]
- "TARGET_SIMD"
- "frecp<FRECP:frecp_suffix>\\t%<s>0, %<s>1"
-- [(set_attr "type" "neon_fp_recp<FRECP:frecp_suffix>_<GPF:Vetype><GPF:q>")]
-+ [(set_attr "type" "neon_fp_recp<FRECP:frecp_suffix>_<GPF_F16:stype>")]
- )
-
- (define_insn "aarch64_frecps<mode>"
-- [(set (match_operand:VALLF 0 "register_operand" "=w")
-- (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
-- (match_operand:VALLF 2 "register_operand" "w")]
-- UNSPEC_FRECPS))]
-+ [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
-+ (unspec:VHSDF_HSDF
-+ [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
-+ (match_operand:VHSDF_HSDF 2 "register_operand" "w")]
-+ UNSPEC_FRECPS))]
- "TARGET_SIMD"
- "frecps\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
-- [(set_attr "type" "neon_fp_recps_<Vetype><q>")]
-+ [(set_attr "type" "neon_fp_recps_<stype><q>")]
- )
-
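frecpe/frecps are the building blocks the approximate-division code emits. One refinement step of the Newton-Raphson reciprocal, written with the corresponding intrinsics:

    #include <arm_neon.h>

    /* One refinement of 1/d: frecps computes (2 - d * x), so
       x1 = x0 * (2 - d * x0).  */
    float32x4_t
    recip_est (float32x4_t d)
    {
      float32x4_t x = vrecpeq_f32 (d);        /* frecpe */
      x = vmulq_f32 (x, vrecpsq_f32 (d, x));  /* frecps + fmul */
      return x;
    }
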
- (define_insn "aarch64_urecpe<mode>"
-@@ -5414,13 +5638,25 @@
- [(set_attr "type" "crypto_aese")]
- )
-
-+;; When AES/AESMC fusion is enabled we want the register allocation to
-+;; look like:
-+;; AESE Vn, _
-+;; AESMC Vn, Vn
-+;; So prefer to tie operand 1 to operand 0 when fusing.
-+
- (define_insn "aarch64_crypto_aes<aesmc_op>v16qi"
-- [(set (match_operand:V16QI 0 "register_operand" "=w")
-- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")]
-+ [(set (match_operand:V16QI 0 "register_operand" "=w,w")
-+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")]
- CRYPTO_AESMC))]
- "TARGET_SIMD && TARGET_CRYPTO"
- "aes<aesmc_op>\\t%0.16b, %1.16b"
-- [(set_attr "type" "crypto_aesmc")]
-+ [(set_attr "type" "crypto_aesmc")
-+ (set_attr_alternative "enabled"
-+ [(if_then_else (match_test
-+ "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)")
-+ (const_string "yes" )
-+ (const_string "no"))
-+ (const_string "yes")])]
- )
-
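The extra "0" alternative, enabled only when AES/AESMC fusion is on, nudges the register allocator toward the tied form that fusing cores execute as one macro-op. With -march=armv8-a+crypto the pair is reachable from the standard intrinsics:

    #include <arm_neon.h>

    /* Desired fused shape:
         aese  v0.16b, v1.16b
         aesmc v0.16b, v0.16b  */
    uint8x16_t
    aes_round (uint8x16_t state, uint8x16_t key)
    {
      return vaesmcq_u8 (vaeseq_u8 (state, key));
    }
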
- ;; sha1
-@@ -5435,6 +5671,26 @@
- [(set_attr "type" "crypto_sha1_fast")]
- )
-
-+(define_insn "aarch64_crypto_sha1hv4si"
-+ [(set (match_operand:SI 0 "register_operand" "=w")
-+ (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
-+ (parallel [(const_int 0)]))]
-+ UNSPEC_SHA1H))]
-+ "TARGET_SIMD && TARGET_CRYPTO && !BYTES_BIG_ENDIAN"
-+ "sha1h\\t%s0, %s1"
-+ [(set_attr "type" "crypto_sha1_fast")]
-+)
-+
-+(define_insn "aarch64_be_crypto_sha1hv4si"
-+ [(set (match_operand:SI 0 "register_operand" "=w")
-+ (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
-+ (parallel [(const_int 3)]))]
-+ UNSPEC_SHA1H))]
-+ "TARGET_SIMD && TARGET_CRYPTO && BYTES_BIG_ENDIAN"
-+ "sha1h\\t%s0, %s1"
-+ [(set_attr "type" "crypto_sha1_fast")]
-+)
-+
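The LE/BE pair selects lane 0 or lane 3 so the same SI-mode value is extracted regardless of endianness; the user-visible intrinsic is unchanged:

    #include <arm_neon.h>

    /* sha1h s0, s1: the fixed rotate of the SHA-1 e word.  */
    uint32_t
    sha1_fixed_rotate (uint32_t e)
    {
      return vsha1h_u32 (e);
    }
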
- (define_insn "aarch64_crypto_sha1su1v4si"
- [(set (match_operand:V4SI 0 "register_operand" "=w")
- (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
---- a/src/gcc/config/aarch64/aarch64-tune.md
-+++ b/src/gcc/config/aarch64/aarch64-tune.md
-@@ -1,5 +1,5 @@
- ;; -*- buffer-read-only: t -*-
- ;; Generated automatically by gentune.sh from aarch64-cores.def
- (define_attr "tune"
-- "cortexa35,cortexa53,cortexa57,cortexa72,exynosm1,qdf24xx,thunderx,xgene1,cortexa57cortexa53,cortexa72cortexa53"
-+ "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,exynosm1,qdf24xx,thunderx,xgene1,vulcan,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53"
- (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
---- a/src/gcc/config/aarch64/aarch64-tuning-flags.def
-+++ b/src/gcc/config/aarch64/aarch64-tuning-flags.def
-@@ -29,5 +29,8 @@
- AARCH64_TUNE_ to give an enum name. */
-
- AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
--AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
-+/* Don't create load/store pairs that are not at least 8-byte aligned;
-+   that is, if the two loads/stores are not 8-byte aligned, don't
-+   combine them into a pair.  */
-+AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
---- a/src/gcc/config/aarch64/aarch64.c
-+++ b/src/gcc/config/aarch64/aarch64.c
-@@ -26,6 +26,7 @@
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "cfghooks.h"
- #include "cfgloop.h"
-@@ -61,7 +62,6 @@
- #include "rtl-iter.h"
- #include "tm-constrs.h"
- #include "sched-int.h"
--#include "cortex-a57-fma-steering.h"
- #include "target-globals.h"
- #include "common/common-target.h"
-
-@@ -141,6 +141,10 @@ static bool aarch64_vector_mode_supported_p (machine_mode);
- static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
- const unsigned char *sel);
- static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
-+static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
-+ const_tree type,
-+ int misalignment,
-+ bool is_packed);
-
- /* Major revision number of the ARM Architecture implemented by the target. */
- unsigned aarch64_architecture_version;
-@@ -152,7 +156,7 @@ enum aarch64_processor aarch64_tune = cortexa53;
- unsigned long aarch64_tune_flags = 0;
-
- /* Global flag for PC relative loads. */
--bool aarch64_nopcrelative_literal_loads;
-+bool aarch64_pcrelative_literal_loads;
-
- /* Support for command line parsing of boolean flags in the tuning
- structures. */
-@@ -250,6 +254,38 @@ static const struct cpu_addrcost_table xgene1_addrcost_table =
- 0, /* imm_offset */
- };
-
-+static const struct cpu_addrcost_table qdf24xx_addrcost_table =
-+{
-+ {
-+ 1, /* hi */
-+ 0, /* si */
-+ 0, /* di */
-+ 1, /* ti */
-+ },
-+ 0, /* pre_modify */
-+ 0, /* post_modify */
-+ 0, /* register_offset */
-+ 0, /* register_sextend */
-+ 0, /* register_zextend */
-+ 0 /* imm_offset */
-+};
-+
-+static const struct cpu_addrcost_table vulcan_addrcost_table =
-+{
-+ {
-+ 0, /* hi */
-+ 0, /* si */
-+ 0, /* di */
-+ 2, /* ti */
-+ },
-+ 0, /* pre_modify */
-+ 0, /* post_modify */
-+ 2, /* register_offset */
-+ 3, /* register_sextend */
-+ 3, /* register_zextend */
-+ 0, /* imm_offset */
-+};
-+
- static const struct cpu_regmove_cost generic_regmove_cost =
- {
- 1, /* GP2GP */
-@@ -308,6 +344,24 @@ static const struct cpu_regmove_cost xgene1_regmove_cost =
- 2 /* FP2FP */
- };
-
-+static const struct cpu_regmove_cost qdf24xx_regmove_cost =
-+{
-+ 2, /* GP2GP */
-+ /* Avoid the use of int<->fp moves for spilling. */
-+ 6, /* GP2FP */
-+ 6, /* FP2GP */
-+ 4 /* FP2FP */
-+};
-+
-+static const struct cpu_regmove_cost vulcan_regmove_cost =
-+{
-+ 1, /* GP2GP */
-+ /* Avoid the use of int<->fp moves for spilling. */
-+ 8, /* GP2FP */
-+ 8, /* FP2GP */
-+ 4 /* FP2FP */
-+};
-+
- /* Generic costs for vector insn classes. */
- static const struct cpu_vector_cost generic_vector_cost =
- {
-@@ -326,18 +380,36 @@ static const struct cpu_vector_cost generic_vector_cost =
- 1 /* cond_not_taken_branch_cost */
- };
-
-+/* ThunderX costs for vector insn classes. */
-+static const struct cpu_vector_cost thunderx_vector_cost =
-+{
-+ 1, /* scalar_stmt_cost */
-+ 3, /* scalar_load_cost */
-+ 1, /* scalar_store_cost */
-+ 4, /* vec_stmt_cost */
-+ 4, /* vec_permute_cost */
-+ 2, /* vec_to_scalar_cost */
-+ 2, /* scalar_to_vec_cost */
-+ 3, /* vec_align_load_cost */
-+ 10, /* vec_unalign_load_cost */
-+ 10, /* vec_unalign_store_cost */
-+ 1, /* vec_store_cost */
-+ 3, /* cond_taken_branch_cost */
-+ 3 /* cond_not_taken_branch_cost */
-+};
-+
- /* Generic costs for vector insn classes. */
- static const struct cpu_vector_cost cortexa57_vector_cost =
- {
- 1, /* scalar_stmt_cost */
- 4, /* scalar_load_cost */
- 1, /* scalar_store_cost */
-- 3, /* vec_stmt_cost */
-+ 2, /* vec_stmt_cost */
- 3, /* vec_permute_cost */
- 8, /* vec_to_scalar_cost */
- 8, /* scalar_to_vec_cost */
-- 5, /* vec_align_load_cost */
-- 5, /* vec_unalign_load_cost */
-+ 4, /* vec_align_load_cost */
-+ 4, /* vec_unalign_load_cost */
- 1, /* vec_unalign_store_cost */
- 1, /* vec_store_cost */
- 1, /* cond_taken_branch_cost */
-@@ -379,6 +451,24 @@ static const struct cpu_vector_cost xgene1_vector_cost =
- 1 /* cond_not_taken_branch_cost */
- };
-
-+/* Costs for vector insn classes for Vulcan. */
-+static const struct cpu_vector_cost vulcan_vector_cost =
-+{
-+ 6, /* scalar_stmt_cost */
-+ 4, /* scalar_load_cost */
-+ 1, /* scalar_store_cost */
-+ 6, /* vec_stmt_cost */
-+ 3, /* vec_permute_cost */
-+ 6, /* vec_to_scalar_cost */
-+ 5, /* scalar_to_vec_cost */
-+ 8, /* vec_align_load_cost */
-+ 8, /* vec_unalign_load_cost */
-+ 4, /* vec_unalign_store_cost */
-+ 4, /* vec_store_cost */
-+ 2, /* cond_taken_branch_cost */
-+ 1 /* cond_not_taken_branch_cost */
-+};
-+
- /* Generic costs for branch instructions. */
- static const struct cpu_branch_cost generic_branch_cost =
- {
-@@ -393,6 +483,37 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
- 3 /* Unpredictable. */
- };
-
-+/* Branch costs for Vulcan. */
-+static const struct cpu_branch_cost vulcan_branch_cost =
-+{
-+ 1, /* Predictable. */
-+ 3 /* Unpredictable. */
-+};
-+
-+/* Generic approximation modes. */
-+static const cpu_approx_modes generic_approx_modes =
-+{
-+ AARCH64_APPROX_NONE, /* division */
-+ AARCH64_APPROX_NONE, /* sqrt */
-+ AARCH64_APPROX_NONE /* recip_sqrt */
-+};
-+
-+/* Approximation modes for Exynos M1. */
-+static const cpu_approx_modes exynosm1_approx_modes =
-+{
-+ AARCH64_APPROX_NONE, /* division */
-+ AARCH64_APPROX_ALL, /* sqrt */
-+ AARCH64_APPROX_ALL /* recip_sqrt */
-+};
-+
-+/* Approximation modes for X-Gene 1. */
-+static const cpu_approx_modes xgene1_approx_modes =
-+{
-+ AARCH64_APPROX_NONE, /* division */
-+ AARCH64_APPROX_NONE, /* sqrt */
-+ AARCH64_APPROX_ALL /* recip_sqrt */
-+};
-+
- static const struct tune_params generic_tunings =
- {
- &cortexa57_extra_costs,
-@@ -400,6 +521,7 @@ static const struct tune_params generic_tunings =
- &generic_regmove_cost,
- &generic_vector_cost,
- &generic_branch_cost,
-+ &generic_approx_modes,
- 4, /* memmov_cost */
- 2, /* issue_rate */
- AARCH64_FUSE_NOTHING, /* fusible_ops */
-@@ -423,14 +545,15 @@ static const struct tune_params cortexa35_tunings =
- &generic_addrcost_table,
- &cortexa53_regmove_cost,
- &generic_vector_cost,
-- &generic_branch_cost,
-+ &cortexa57_branch_cost,
-+ &generic_approx_modes,
- 4, /* memmov_cost */
- 1, /* issue_rate */
-- (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
-- 8, /* function_align. */
-+ 16, /* function_align. */
- 8, /* jump_align. */
-- 4, /* loop_align. */
-+ 8, /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* vec_reassoc_width. */
-@@ -448,14 +571,15 @@ static const struct tune_params cortexa53_tunings =
- &generic_addrcost_table,
- &cortexa53_regmove_cost,
- &generic_vector_cost,
-- &generic_branch_cost,
-+ &cortexa57_branch_cost,
-+ &generic_approx_modes,
- 4, /* memmov_cost */
- 2, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
-- 8, /* function_align. */
-+ 16, /* function_align. */
- 8, /* jump_align. */
-- 4, /* loop_align. */
-+ 8, /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* vec_reassoc_width. */
-@@ -474,13 +598,14 @@ static const struct tune_params cortexa57_tunings =
- &cortexa57_regmove_cost,
- &cortexa57_vector_cost,
- &cortexa57_branch_cost,
-+ &generic_approx_modes,
- 4, /* memmov_cost */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
- 16, /* function_align. */
- 8, /* jump_align. */
-- 4, /* loop_align. */
-+ 8, /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* vec_reassoc_width. */
-@@ -498,14 +623,15 @@ static const struct tune_params cortexa72_tunings =
- &cortexa57_addrcost_table,
- &cortexa57_regmove_cost,
- &cortexa57_vector_cost,
-- &generic_branch_cost,
-+ &cortexa57_branch_cost,
-+ &generic_approx_modes,
- 4, /* memmov_cost */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
- | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
- 16, /* function_align. */
- 8, /* jump_align. */
-- 4, /* loop_align. */
-+ 8, /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* vec_reassoc_width. */
-@@ -513,7 +639,33 @@ static const struct tune_params cortexa72_tunings =
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- 0, /* cache_line_size. */
-- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
-+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
-+ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
-+};
-+
-+static const struct tune_params cortexa73_tunings =
-+{
-+ &cortexa57_extra_costs,
-+ &cortexa57_addrcost_table,
-+ &cortexa57_regmove_cost,
-+ &cortexa57_vector_cost,
-+ &cortexa57_branch_cost,
-+ &generic_approx_modes,
-+ 4, /* memmov_cost. */
-+ 2, /* issue_rate. */
-+ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-+ | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
-+ 16, /* function_align. */
-+ 8, /* jump_align. */
-+ 8, /* loop_align. */
-+ 2, /* int_reassoc_width. */
-+ 4, /* fp_reassoc_width. */
-+ 1, /* vec_reassoc_width. */
-+ 2, /* min_div_recip_mul_sf. */
-+ 2, /* min_div_recip_mul_df. */
-+ 0, /* max_case_values. */
-+ 0, /* cache_line_size. */
-+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
- };
-
-@@ -524,6 +676,7 @@ static const struct tune_params exynosm1_tunings =
- &exynosm1_regmove_cost,
- &exynosm1_vector_cost,
- &generic_branch_cost,
-+ &exynosm1_approx_modes,
- 4, /* memmov_cost */
- 3, /* issue_rate */
- (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
-@@ -538,7 +691,7 @@ static const struct tune_params exynosm1_tunings =
- 48, /* max_case_values. */
- 64, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
-- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
-+ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
- };
-
- static const struct tune_params thunderx_tunings =
-@@ -546,8 +699,9 @@ static const struct tune_params thunderx_tunings =
- &thunderx_extra_costs,
- &generic_addrcost_table,
- &thunderx_regmove_cost,
-- &generic_vector_cost,
-+ &thunderx_vector_cost,
- &generic_branch_cost,
-+ &generic_approx_modes,
- 6, /* memmov_cost */
- 2, /* issue_rate */
- AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
-@@ -562,7 +716,7 @@ static const struct tune_params thunderx_tunings =
- 0, /* max_case_values. */
- 0, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
-- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
-+ (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
- };
-
- static const struct tune_params xgene1_tunings =
-@@ -572,6 +726,7 @@ static const struct tune_params xgene1_tunings =
- &xgene1_regmove_cost,
- &xgene1_vector_cost,
- &generic_branch_cost,
-+ &xgene1_approx_modes,
- 6, /* memmov_cost */
- 4, /* issue_rate */
- AARCH64_FUSE_NOTHING, /* fusible_ops */
-@@ -586,7 +741,58 @@ static const struct tune_params xgene1_tunings =
- 0, /* max_case_values. */
- 0, /* cache_line_size. */
- tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
-- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
-+ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
-+};
-+
-+static const struct tune_params qdf24xx_tunings =
-+{
-+ &qdf24xx_extra_costs,
-+ &qdf24xx_addrcost_table,
-+ &qdf24xx_regmove_cost,
-+ &generic_vector_cost,
-+ &generic_branch_cost,
-+ &generic_approx_modes,
-+ 4, /* memmov_cost */
-+ 4, /* issue_rate */
-+ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
-+ 16, /* function_align. */
-+ 8, /* jump_align. */
-+ 16, /* loop_align. */
-+ 2, /* int_reassoc_width. */
-+ 4, /* fp_reassoc_width. */
-+ 1, /* vec_reassoc_width. */
-+ 2, /* min_div_recip_mul_sf. */
-+ 2, /* min_div_recip_mul_df. */
-+ 0, /* max_case_values. */
-+ 64, /* cache_line_size. */
-+ tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
-+ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
-+};
-+
-+static const struct tune_params vulcan_tunings =
-+{
-+ &vulcan_extra_costs,
-+ &vulcan_addrcost_table,
-+ &vulcan_regmove_cost,
-+ &vulcan_vector_cost,
-+ &vulcan_branch_cost,
-+ &generic_approx_modes,
-+ 4, /* memmov_cost. */
-+ 4, /* issue_rate. */
-+  AARCH64_FUSE_NOTHING, /* fusible_ops.  */
-+ 16, /* function_align. */
-+ 8, /* jump_align. */
-+ 16, /* loop_align. */
-+ 3, /* int_reassoc_width. */
-+ 2, /* fp_reassoc_width. */
-+ 2, /* vec_reassoc_width. */
-+ 2, /* min_div_recip_mul_sf. */
-+ 2, /* min_div_recip_mul_df. */
-+ 0, /* max_case_values. */
-+ 64, /* cache_line_size. */
-+ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
-+ (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
- };
-
- /* Support for fine-grained override of the tuning structures. */
-@@ -663,16 +869,6 @@ struct aarch64_option_extension
- const unsigned long flags_off;
- };
-
--/* ISA extensions in AArch64. */
--static const struct aarch64_option_extension all_extensions[] =
--{
--#define AARCH64_OPT_EXTENSION(NAME, X, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
-- {NAME, FLAGS_ON, FLAGS_OFF},
--#include "aarch64-option-extensions.def"
--#undef AARCH64_OPT_EXTENSION
-- {NULL, 0, 0}
--};
--
- typedef enum aarch64_cond_code
- {
- AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
-@@ -1110,7 +1306,8 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
- emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
-
- if (mode != GET_MODE (gp_rtx))
-- gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
-+ gp_rtx = gen_lowpart (mode, gp_rtx);
-+
- }
-
- if (mode == ptr_mode)
-@@ -1186,10 +1383,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
- case SYMBOL_SMALL_TLSGD:
- {
- rtx_insn *insns;
-- rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
-+ machine_mode mode = GET_MODE (dest);
-+ rtx result = gen_rtx_REG (mode, R0_REGNUM);
-
- start_sequence ();
-- aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
-+ if (TARGET_ILP32)
-+ aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
-+ else
-+ aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
- insns = get_insns ();
- end_sequence ();
-
-@@ -1703,7 +1904,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
- we need to expand the literal pool access carefully.
- This is something that needs to be done in a number
- of places, so could well live as a separate function. */
-- if (aarch64_nopcrelative_literal_loads)
-+ if (!aarch64_pcrelative_literal_loads)
- {
- gcc_assert (can_create_pseudo_p ());
- base = gen_reg_rtx (ptr_mode);
-@@ -1766,6 +1967,88 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
- aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
- }
-
-+/* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
-+ temporary value if necessary. FRAME_RELATED_P should be true if
-+ the RTX_FRAME_RELATED flag should be set and CFA adjustments added
-+ to the generated instructions. If SCRATCHREG is known to hold
-+ abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
-+ immediate again.
-+
-+ Since this function may be used to adjust the stack pointer, we must
-+ ensure that it cannot cause transient stack deallocation (for example
-+ by first incrementing SP and then decrementing when adjusting by a
-+ large immediate). */
-+
-+static void
-+aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
-+ HOST_WIDE_INT delta, bool frame_related_p,
-+ bool emit_move_imm)
-+{
-+ HOST_WIDE_INT mdelta = abs_hwi (delta);
-+ rtx this_rtx = gen_rtx_REG (mode, regnum);
-+ rtx_insn *insn;
-+
-+ if (!mdelta)
-+ return;
-+
-+ /* Single instruction adjustment. */
-+ if (aarch64_uimm12_shift (mdelta))
-+ {
-+ insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
-+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
-+ return;
-+ }
-+
-+ /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
-+ Only do this if mdelta is not a 16-bit move as adjusting using a move
-+ is better. */
-+ if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
-+ {
-+ HOST_WIDE_INT low_off = mdelta & 0xfff;
-+
-+ low_off = delta < 0 ? -low_off : low_off;
-+ insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
-+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
-+ insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
-+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
-+ return;
-+ }
-+
-+ /* Emit a move immediate if required and an addition/subtraction. */
-+ rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
-+ if (emit_move_imm)
-+ aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
-+ insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
-+ : gen_add2_insn (this_rtx, scratch_rtx));
-+ if (frame_related_p)
-+ {
-+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
-+ rtx adj = plus_constant (mode, this_rtx, delta);
-+ add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
-+ }
-+}
-+
-+static inline void
-+aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
-+ HOST_WIDE_INT delta)
-+{
-+ aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
-+}
-+
-+static inline void
-+aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
-+{
-+ aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
-+ true, emit_move_imm);
-+}
-+
-+static inline void
-+aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
-+{
-+ aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
-+ frame_related_p, true);
-+}
-+
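A worked example of the two-instruction path above: for delta = 0x123456 the split yields low_off = 0x456 and a remainder of 0x123000, both encodable as 12-bit immediates (the second with a 12-bit shift). A stand-alone sketch of just that arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      int64_t delta = 0x123456;
      int64_t low = delta & 0xfff;   /* add  sp, sp, #0x456           */
      int64_t rest = delta - low;    /* add  sp, sp, #0x123, lsl #12  */
      printf ("%#llx = %#llx + %#llx\n",
              (long long) delta, (long long) rest, (long long) low);
      return 0;
    }
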
- static bool
- aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
- tree exp ATTRIBUTE_UNUSED)
-@@ -2494,7 +2777,7 @@ static void
- aarch64_layout_frame (void)
- {
- HOST_WIDE_INT offset = 0;
-- int regno;
-+ int regno, last_fp_reg = INVALID_REGNUM;
-
- if (reload_completed && cfun->machine->frame.laid_out)
- return;
-@@ -2502,8 +2785,8 @@ aarch64_layout_frame (void)
- #define SLOT_NOT_REQUIRED (-2)
- #define SLOT_REQUIRED (-1)
-
-- cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
-- cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
-+ cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
-+ cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
-
- /* First mark all the registers that really need to be saved... */
- for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
-@@ -2528,7 +2811,10 @@ aarch64_layout_frame (void)
- for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
- if (df_regs_ever_live_p (regno)
- && !call_used_regs[regno])
-- cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
-+ {
-+ cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
-+ last_fp_reg = regno;
-+ }
-
- if (frame_pointer_needed)
- {
-@@ -2537,7 +2823,6 @@ aarch64_layout_frame (void)
- cfun->machine->frame.wb_candidate1 = R29_REGNUM;
- cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
- cfun->machine->frame.wb_candidate2 = R30_REGNUM;
-- cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
- offset += 2 * UNITS_PER_WORD;
- }
-
-@@ -2546,35 +2831,46 @@ aarch64_layout_frame (void)
- if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
- {
- cfun->machine->frame.reg_offset[regno] = offset;
-- if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
-+ if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
- cfun->machine->frame.wb_candidate1 = regno;
-- else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
-+ else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
- cfun->machine->frame.wb_candidate2 = regno;
- offset += UNITS_PER_WORD;
- }
-
-+ HOST_WIDE_INT max_int_offset = offset;
-+ offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
-+ bool has_align_gap = offset != max_int_offset;
-+
- for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
- if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
- {
-+ /* If there is an alignment gap between integer and fp callee-saves,
-+ allocate the last fp register to it if possible. */
-+ if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
-+ {
-+ cfun->machine->frame.reg_offset[regno] = max_int_offset;
-+ break;
-+ }
-+
- cfun->machine->frame.reg_offset[regno] = offset;
-- if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
-+ if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
- cfun->machine->frame.wb_candidate1 = regno;
-- else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
-+ else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
- && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
- cfun->machine->frame.wb_candidate2 = regno;
- offset += UNITS_PER_WORD;
- }
-
-- cfun->machine->frame.padding0 =
-- (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
- offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
-
- cfun->machine->frame.saved_regs_size = offset;
-
-+ HOST_WIDE_INT varargs_and_saved_regs_size
-+ = offset + cfun->machine->frame.saved_varargs_size;
-+
- cfun->machine->frame.hard_fp_offset
-- = ROUND_UP (cfun->machine->frame.saved_varargs_size
-- + get_frame_size ()
-- + cfun->machine->frame.saved_regs_size,
-+ = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
- STACK_BOUNDARY / BITS_PER_UNIT);
-
- cfun->machine->frame.frame_size
-@@ -2582,15 +2878,92 @@ aarch64_layout_frame (void)
- + crtl->outgoing_args_size,
- STACK_BOUNDARY / BITS_PER_UNIT);
-
-+ cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
-+
-+ cfun->machine->frame.initial_adjust = 0;
-+ cfun->machine->frame.final_adjust = 0;
-+ cfun->machine->frame.callee_adjust = 0;
-+ cfun->machine->frame.callee_offset = 0;
-+
-+ HOST_WIDE_INT max_push_offset = 0;
-+ if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
-+ max_push_offset = 512;
-+ else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
-+ max_push_offset = 256;
-+
-+ if (cfun->machine->frame.frame_size < max_push_offset
-+ && crtl->outgoing_args_size == 0)
-+ {
-+ /* Simple, small frame with no outgoing arguments:
-+ stp reg1, reg2, [sp, -frame_size]!
-+ stp reg3, reg4, [sp, 16] */
-+ cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
-+ }
-+ else if ((crtl->outgoing_args_size
-+ + cfun->machine->frame.saved_regs_size < 512)
-+ && !(cfun->calls_alloca
-+ && cfun->machine->frame.hard_fp_offset < max_push_offset))
-+ {
-+ /* Frame with small outgoing arguments:
-+ sub sp, sp, frame_size
-+ stp reg1, reg2, [sp, outgoing_args_size]
-+ stp reg3, reg4, [sp, outgoing_args_size + 16] */
-+ cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
-+ cfun->machine->frame.callee_offset
-+ = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
-+ }
-+ else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
-+ {
-+ /* Frame with large outgoing arguments but a small local area:
-+ stp reg1, reg2, [sp, -hard_fp_offset]!
-+ stp reg3, reg4, [sp, 16]
-+ sub sp, sp, outgoing_args_size */
-+ cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
-+ cfun->machine->frame.final_adjust
-+ = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
-+ }
-+ else if (!frame_pointer_needed
-+ && varargs_and_saved_regs_size < max_push_offset)
-+ {
-+ /* Frame with large local area and outgoing arguments (this pushes the
-+ callee-saves first, followed by the locals and outgoing area):
-+ stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
-+ stp reg3, reg4, [sp, 16]
-+ sub sp, sp, frame_size - varargs_and_saved_regs_size */
-+ cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
-+ cfun->machine->frame.final_adjust
-+ = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
-+ cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
-+ cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
-+ }
-+ else
-+ {
-+ /* Frame with large local area and outgoing arguments using frame pointer:
-+ sub sp, sp, hard_fp_offset
-+ stp x29, x30, [sp, 0]
-+ add x29, sp, 0
-+ stp reg3, reg4, [sp, 16]
-+ sub sp, sp, outgoing_args_size */
-+ cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
-+ cfun->machine->frame.final_adjust
-+ = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
-+ }
-+
- cfun->machine->frame.laid_out = true;
- }
-
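/* Illustrative note, not part of the patch: with both write-back
   candidates available, max_push_offset is 512, so a function with
   frame_size == 96 and no outgoing arguments takes the first branch
   above (callee_adjust == 96) and allocates the whole frame with the
   write-back store:

       stp x29, x30, [sp, -96]!
       stp x19, x20, [sp, 16]  */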
-+/* Return true if the register REGNO is saved on entry to
-+ the current function. */
-+
- static bool
- aarch64_register_saved_on_entry (int regno)
- {
- return cfun->machine->frame.reg_offset[regno] >= 0;
- }
-
-+/* Return the next register up from REGNO up to LIMIT for the callee
-+ to save. */
-+
- static unsigned
- aarch64_next_callee_save (unsigned regno, unsigned limit)
- {
-@@ -2599,6 +2972,9 @@ aarch64_next_callee_save (unsigned regno, unsigned limit)
- return regno;
- }
-
-+/* Push the register numbered REGNO of mode MODE to the stack with write-back
-+ adjusting the stack by ADJUSTMENT. */
-+
- static void
- aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
- HOST_WIDE_INT adjustment)
-@@ -2615,6 +2991,10 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
- RTX_FRAME_RELATED_P (insn) = 1;
- }
-
-+/* Generate and return an instruction to store the pair of registers
-+ REG and REG2 of mode MODE to location BASE with write-back adjusting
-+ the stack location BASE by ADJUSTMENT. */
-+
- static rtx
- aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
- HOST_WIDE_INT adjustment)
-@@ -2634,11 +3014,18 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
- }
- }
-
-+/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
-+ stack pointer by ADJUSTMENT. */
-+
- static void
--aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
-- unsigned regno2, HOST_WIDE_INT adjustment)
-+aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
- {
- rtx_insn *insn;
-+ machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
-+
-+ if (regno2 == INVALID_REGNUM)
-+ return aarch64_pushwb_single_reg (mode, regno1, adjustment);
-+
- rtx reg1 = gen_rtx_REG (mode, regno1);
- rtx reg2 = gen_rtx_REG (mode, regno2);
-
-@@ -2649,6 +3036,9 @@ aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
- RTX_FRAME_RELATED_P (insn) = 1;
- }
-
-+/* Load the pair of registers REG and REG2 of mode MODE from stack location
-+ BASE, adjusting it by ADJUSTMENT afterwards. */
-+
- static rtx
- aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
- HOST_WIDE_INT adjustment)
-@@ -2666,6 +3056,37 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
- }
- }
-
-+/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
-+ afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
-+ into CFI_OPS. */
-+
-+static void
-+aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
-+ rtx *cfi_ops)
-+{
-+ machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
-+ rtx reg1 = gen_rtx_REG (mode, regno1);
-+
-+ *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
-+
-+ if (regno2 == INVALID_REGNUM)
-+ {
-+ rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
-+ mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
-+ emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
-+ }
-+ else
-+ {
-+ rtx reg2 = gen_rtx_REG (mode, regno2);
-+ *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
-+ emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
-+ reg2, adjustment));
-+ }
-+}
-+
-+/* Generate and return a store pair instruction of mode MODE to store
-+ register REG1 to MEM1 and register REG2 to MEM2. */
-+
- static rtx
- aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
- rtx reg2)
-@@ -2683,6 +3104,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
- }
- }
-
-+/* Generate and return a load pair instruction of mode MODE to load register
-+ REG1 from MEM1 and register REG2 from MEM2. */
-+
- static rtx
- aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
- rtx mem2)
-@@ -2700,6 +3124,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
- }
- }
-
-+/* Emit code to save the callee-saved registers from register number START
-+ to LIMIT to the stack at the location starting at offset START_OFFSET,
-+ skipping any write-back candidates if SKIP_WB is true. */
-
- static void
- aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
-@@ -2758,6 +3185,11 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
- }
- }
-
-+/* Emit code to restore the callee registers of mode MODE from register
-+ number START up to and including LIMIT. Restore from the stack offset
-+ START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
-+ Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
-+
- static void
- aarch64_restore_callee_saves (machine_mode mode,
- HOST_WIDE_INT start_offset, unsigned start,
-@@ -2852,23 +3284,16 @@ aarch64_restore_callee_saves (machine_mode mode,
- void
- aarch64_expand_prologue (void)
- {
-- /* sub sp, sp, #<frame_size>
-- stp {fp, lr}, [sp, #<frame_size> - 16]
-- add fp, sp, #<frame_size> - hardfp_offset
-- stp {cs_reg}, [fp, #-16] etc.
--
-- sub sp, sp, <final_adjustment_if_any>
-- */
-- HOST_WIDE_INT frame_size, offset;
-- HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
-- HOST_WIDE_INT hard_fp_offset;
-- rtx_insn *insn;
--
- aarch64_layout_frame ();
-
-- offset = frame_size = cfun->machine->frame.frame_size;
-- hard_fp_offset = cfun->machine->frame.hard_fp_offset;
-- fp_offset = frame_size - hard_fp_offset;
-+ HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
-+ HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
-+ HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
-+ HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
-+ HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
-+ unsigned reg1 = cfun->machine->frame.wb_candidate1;
-+ unsigned reg2 = cfun->machine->frame.wb_candidate2;
-+ rtx_insn *insn;
-
- if (flag_stack_usage_info)
- current_function_static_stack_size = frame_size;
-@@ -2885,129 +3310,28 @@ aarch64_expand_prologue (void)
- aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
- }
-
-- /* Store pairs and load pairs have a range only -512 to 504. */
-- if (offset >= 512)
-- {
-- /* When the frame has a large size, an initial decrease is done on
-- the stack pointer to jump over the callee-allocated save area for
-- register varargs, the local variable area and/or the callee-saved
-- register area. This will allow the pre-index write-back
-- store pair instructions to be used for setting up the stack frame
-- efficiently. */
-- offset = hard_fp_offset;
-- if (offset >= 512)
-- offset = cfun->machine->frame.saved_regs_size;
-+ aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
-
-- frame_size -= (offset + crtl->outgoing_args_size);
-- fp_offset = 0;
-+ if (callee_adjust != 0)
-+ aarch64_push_regs (reg1, reg2, callee_adjust);
-
-- if (frame_size >= 0x1000000)
-- {
-- rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
-- emit_move_insn (op0, GEN_INT (-frame_size));
-- insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
--
-- add_reg_note (insn, REG_CFA_ADJUST_CFA,
-- gen_rtx_SET (stack_pointer_rtx,
-- plus_constant (Pmode, stack_pointer_rtx,
-- -frame_size)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- }
-- else if (frame_size > 0)
-- {
-- int hi_ofs = frame_size & 0xfff000;
-- int lo_ofs = frame_size & 0x000fff;
--
-- if (hi_ofs)
-- {
-- insn = emit_insn (gen_add2_insn
-- (stack_pointer_rtx, GEN_INT (-hi_ofs)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- }
-- if (lo_ofs)
-- {
-- insn = emit_insn (gen_add2_insn
-- (stack_pointer_rtx, GEN_INT (-lo_ofs)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- }
-- }
-- }
-- else
-- frame_size = -1;
--
-- if (offset > 0)
-+ if (frame_pointer_needed)
- {
-- bool skip_wb = false;
--
-- if (frame_pointer_needed)
-- {
-- skip_wb = true;
--
-- if (fp_offset)
-- {
-- insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-- GEN_INT (-offset)));
-- RTX_FRAME_RELATED_P (insn) = 1;
--
-- aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
-- R30_REGNUM, false);
-- }
-- else
-- aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
--
-- /* Set up frame pointer to point to the location of the
-- previous frame pointer on the stack. */
-- insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
-- stack_pointer_rtx,
-- GEN_INT (fp_offset)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
-- }
-- else
-- {
-- unsigned reg1 = cfun->machine->frame.wb_candidate1;
-- unsigned reg2 = cfun->machine->frame.wb_candidate2;
--
-- if (fp_offset
-- || reg1 == FIRST_PSEUDO_REGISTER
-- || (reg2 == FIRST_PSEUDO_REGISTER
-- && offset >= 256))
-- {
-- insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-- GEN_INT (-offset)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- }
-- else
-- {
-- machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
--
-- skip_wb = true;
--
-- if (reg2 == FIRST_PSEUDO_REGISTER)
-- aarch64_pushwb_single_reg (mode1, reg1, offset);
-- else
-- aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
-- }
-- }
--
-- aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
-- skip_wb);
-- aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
-- skip_wb);
-+ if (callee_adjust == 0)
-+ aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
-+ R30_REGNUM, false);
-+ insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
-+ stack_pointer_rtx,
-+ GEN_INT (callee_offset)));
-+ RTX_FRAME_RELATED_P (insn) = 1;
-+ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
- }
-
-- /* when offset >= 512,
-- sub sp, sp, #<outgoing_args_size> */
-- if (frame_size > -1)
-- {
-- if (crtl->outgoing_args_size > 0)
-- {
-- insn = emit_insn (gen_add2_insn
-- (stack_pointer_rtx,
-- GEN_INT (- crtl->outgoing_args_size)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- }
-- }
-+ aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
-+ callee_adjust != 0 || frame_pointer_needed);
-+ aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-+ callee_adjust != 0 || frame_pointer_needed);
-+ aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
- }
-
- /* Return TRUE if we can use a simple_return insn.
-@@ -3030,151 +3354,80 @@ aarch64_use_return_insn_p (void)
- return cfun->machine->frame.frame_size == 0;
- }
-
--/* Generate the epilogue instructions for returning from a function. */
-+/* Generate the epilogue instructions for returning from a function.
-+ This is almost exactly the reverse of the prologue sequence, except
-+ that we need to insert barriers to avoid scheduling loads that read
-+ from a deallocated stack, and we optimize the unwind records by
-+ emitting them all together if possible. */
- void
- aarch64_expand_epilogue (bool for_sibcall)
- {
-- HOST_WIDE_INT frame_size, offset;
-- HOST_WIDE_INT fp_offset;
-- HOST_WIDE_INT hard_fp_offset;
-- rtx_insn *insn;
-- /* We need to add memory barrier to prevent read from deallocated stack. */
-- bool need_barrier_p = (get_frame_size () != 0
-- || cfun->machine->frame.saved_varargs_size
-- || crtl->calls_eh_return);
--
- aarch64_layout_frame ();
-
-- offset = frame_size = cfun->machine->frame.frame_size;
-- hard_fp_offset = cfun->machine->frame.hard_fp_offset;
-- fp_offset = frame_size - hard_fp_offset;
-+ HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
-+ HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
-+ HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
-+ HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
-+ unsigned reg1 = cfun->machine->frame.wb_candidate1;
-+ unsigned reg2 = cfun->machine->frame.wb_candidate2;
-+ rtx cfi_ops = NULL;
-+ rtx_insn *insn;
-
-- /* Store pairs and load pairs have a range only -512 to 504. */
-- if (offset >= 512)
-- {
-- offset = hard_fp_offset;
-- if (offset >= 512)
-- offset = cfun->machine->frame.saved_regs_size;
-+ /* We need a memory barrier to prevent reads from the deallocated stack. */
-+ bool need_barrier_p = (get_frame_size ()
-+ + cfun->machine->frame.saved_varargs_size) != 0;
-
-- frame_size -= (offset + crtl->outgoing_args_size);
-- fp_offset = 0;
-- if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
-- {
-- insn = emit_insn (gen_add2_insn
-- (stack_pointer_rtx,
-- GEN_INT (crtl->outgoing_args_size)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- }
-+ /* Emit a barrier to prevent loads from a deallocated stack. */
-+ if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
-+ || crtl->calls_eh_return)
-+ {
-+ emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
-+ need_barrier_p = false;
- }
-- else
-- frame_size = -1;
-
-- /* If there were outgoing arguments or we've done dynamic stack
-- allocation, then restore the stack pointer from the frame
-- pointer. This is at most one insn and more efficient than using
-- GCC's internal mechanism. */
-- if (frame_pointer_needed
-- && (crtl->outgoing_args_size || cfun->calls_alloca))
-+ /* Restore the stack pointer from the frame pointer if it may not
-+ be the same as the stack pointer. */
-+ if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
- {
-- if (cfun->calls_alloca)
-- emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
--
- insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
- hard_frame_pointer_rtx,
-- GEN_INT (0)));
-- offset = offset - fp_offset;
-- }
--
-- if (offset > 0)
-- {
-- unsigned reg1 = cfun->machine->frame.wb_candidate1;
-- unsigned reg2 = cfun->machine->frame.wb_candidate2;
-- bool skip_wb = true;
-- rtx cfi_ops = NULL;
--
-- if (frame_pointer_needed)
-- fp_offset = 0;
-- else if (fp_offset
-- || reg1 == FIRST_PSEUDO_REGISTER
-- || (reg2 == FIRST_PSEUDO_REGISTER
-- && offset >= 256))
-- skip_wb = false;
--
-- aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
-- skip_wb, &cfi_ops);
-- aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
-- skip_wb, &cfi_ops);
--
-- if (need_barrier_p)
-- emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
--
-- if (skip_wb)
-- {
-- machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
-- rtx rreg1 = gen_rtx_REG (mode1, reg1);
--
-- cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
-- if (reg2 == FIRST_PSEUDO_REGISTER)
-- {
-- rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
-- mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
-- mem = gen_rtx_MEM (mode1, mem);
-- insn = emit_move_insn (rreg1, mem);
-- }
-- else
-- {
-- rtx rreg2 = gen_rtx_REG (mode1, reg2);
--
-- cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
-- insn = emit_insn (aarch64_gen_loadwb_pair
-- (mode1, stack_pointer_rtx, rreg1,
-- rreg2, offset));
-- }
-- }
-- else
-- {
-- insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-- GEN_INT (offset)));
-- }
--
-- /* Reset the CFA to be SP + FRAME_SIZE. */
-- rtx new_cfa = stack_pointer_rtx;
-- if (frame_size > 0)
-- new_cfa = plus_constant (Pmode, new_cfa, frame_size);
-- cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
-- REG_NOTES (insn) = cfi_ops;
-- RTX_FRAME_RELATED_P (insn) = 1;
-+ GEN_INT (-callee_offset)));
-+ /* If writeback is used when restoring callee-saves, the CFA
-+ is restored on the instruction doing the writeback. */
-+ RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
- }
-+ else
-+ aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
-
-- if (frame_size > 0)
-- {
-- if (need_barrier_p)
-- emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
-+ aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
-+ callee_adjust != 0, &cfi_ops);
-+ aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
-+ callee_adjust != 0, &cfi_ops);
-
-- if (frame_size >= 0x1000000)
-- {
-- rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
-- emit_move_insn (op0, GEN_INT (frame_size));
-- insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
-- }
-- else
-- {
-- int hi_ofs = frame_size & 0xfff000;
-- int lo_ofs = frame_size & 0x000fff;
-+ if (need_barrier_p)
-+ emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
-
-- if (hi_ofs && lo_ofs)
-- {
-- insn = emit_insn (gen_add2_insn
-- (stack_pointer_rtx, GEN_INT (hi_ofs)));
-- RTX_FRAME_RELATED_P (insn) = 1;
-- frame_size = lo_ofs;
-- }
-- insn = emit_insn (gen_add2_insn
-- (stack_pointer_rtx, GEN_INT (frame_size)));
-- }
-+ if (callee_adjust != 0)
-+ aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
-+
-+ if (callee_adjust != 0 || initial_adjust > 65536)
-+ {
-+ /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
-+ insn = get_last_insn ();
-+ rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
-+ REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
-+ RTX_FRAME_RELATED_P (insn) = 1;
-+ cfi_ops = NULL;
-+ }
-+
-+ aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
-
-- /* Reset the CFA to be SP + 0. */
-- add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
-+ if (cfi_ops)
-+ {
-+ /* Emit delayed restores and reset the CFA to be SP. */
-+ insn = get_last_insn ();
-+ cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
-+ REG_NOTES (insn) = cfi_ops;
- RTX_FRAME_RELATED_P (insn) = 1;
- }
-
-@@ -3230,122 +3483,6 @@ aarch64_eh_return_handler_rtx (void)
- return tmp;
- }
-
--/* Possibly output code to build up a constant in a register. For
-- the benefit of the costs infrastructure, returns the number of
-- instructions which would be emitted. GENERATE inhibits or
-- enables code generation. */
--
--static int
--aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
--{
-- int insns = 0;
--
-- if (aarch64_bitmask_imm (val, DImode))
-- {
-- if (generate)
-- emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
-- insns = 1;
-- }
-- else
-- {
-- int i;
-- int ncount = 0;
-- int zcount = 0;
-- HOST_WIDE_INT valp = val >> 16;
-- HOST_WIDE_INT valm;
-- HOST_WIDE_INT tval;
--
-- for (i = 16; i < 64; i += 16)
-- {
-- valm = (valp & 0xffff);
--
-- if (valm != 0)
-- ++ zcount;
--
-- if (valm != 0xffff)
-- ++ ncount;
--
-- valp >>= 16;
-- }
--
-- /* zcount contains the number of additional MOVK instructions
-- required if the constant is built up with an initial MOVZ instruction,
-- while ncount is the number of MOVK instructions required if starting
-- with a MOVN instruction. Choose the sequence that yields the fewest
-- number of instructions, preferring MOVZ instructions when they are both
-- the same. */
-- if (ncount < zcount)
-- {
-- if (generate)
-- emit_move_insn (gen_rtx_REG (Pmode, regnum),
-- GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
-- tval = 0xffff;
-- insns++;
-- }
-- else
-- {
-- if (generate)
-- emit_move_insn (gen_rtx_REG (Pmode, regnum),
-- GEN_INT (val & 0xffff));
-- tval = 0;
-- insns++;
-- }
--
-- val >>= 16;
--
-- for (i = 16; i < 64; i += 16)
-- {
-- if ((val & 0xffff) != tval)
-- {
-- if (generate)
-- emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
-- GEN_INT (i),
-- GEN_INT (val & 0xffff)));
-- insns++;
-- }
-- val >>= 16;
-- }
-- }
-- return insns;
--}
--
--static void
--aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
--{
-- HOST_WIDE_INT mdelta = delta;
-- rtx this_rtx = gen_rtx_REG (Pmode, regnum);
-- rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
--
-- if (mdelta < 0)
-- mdelta = -mdelta;
--
-- if (mdelta >= 4096 * 4096)
-- {
-- (void) aarch64_build_constant (scratchreg, delta, true);
-- emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
-- }
-- else if (mdelta > 0)
-- {
-- if (mdelta >= 4096)
-- {
-- emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
-- rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
-- if (delta < 0)
-- emit_insn (gen_rtx_SET (this_rtx,
-- gen_rtx_MINUS (Pmode, this_rtx, shift)));
-- else
-- emit_insn (gen_rtx_SET (this_rtx,
-- gen_rtx_PLUS (Pmode, this_rtx, shift)));
-- }
-- if (mdelta % 4096 != 0)
-- {
-- scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
-- emit_insn (gen_rtx_SET (this_rtx,
-- gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
-- }
-- }
--}
--
- /* Output code to add DELTA to the first argument, and then jump
- to FUNCTION. Used for C++ multiple inheritance. */
- static void
-@@ -3366,7 +3503,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
- emit_note (NOTE_INSN_PROLOGUE_END);
-
- if (vcall_offset == 0)
-- aarch64_add_constant (this_regno, IP1_REGNUM, delta);
-+ aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
- else
- {
- gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
-@@ -3382,7 +3519,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
- addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
- plus_constant (Pmode, this_rtx, delta));
- else
-- aarch64_add_constant (this_regno, IP1_REGNUM, delta);
-+ aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
- }
-
- if (Pmode == ptr_mode)
-@@ -3396,7 +3533,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
- addr = plus_constant (Pmode, temp0, vcall_offset);
- else
- {
-- (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
-+ aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
-+ Pmode);
- addr = gen_rtx_PLUS (Pmode, temp0, temp1);
- }
-
-@@ -3575,7 +3713,12 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
- return aarch64_tls_referenced_p (x);
- }
-
--/* Implement TARGET_CASE_VALUES_THRESHOLD. */
-+/* Implement TARGET_CASE_VALUES_THRESHOLD.
-+ The expansion for a table switch is quite expensive due to the number
-+ of instructions, the table lookup and the hard-to-predict indirect jump.
-+ When optimizing for speed with -O3 enabled, use the per-core tuning if
-+ set, otherwise use tables for > 16 cases as a tradeoff between size and
-+ performance. When optimizing for size, use the default setting. */
-
- static unsigned int
- aarch64_case_values_threshold (void)
-@@ -3586,7 +3729,7 @@ aarch64_case_values_threshold (void)
- && selected_cpu->tune->max_case_values != 0)
- return selected_cpu->tune->max_case_values;
- else
-- return default_case_values_threshold ();
-+ return optimize_size ? default_case_values_threshold () : 17;
- }
-
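/* Illustrative note, not part of the patch: with this change a switch
   with 17 or more case labels is expanded as a jump table when
   optimizing for speed (unless the per-core tuning overrides it),
   while -Os keeps the generic default_case_values_threshold ().  */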
- /* Return true if register REGNO is a valid index register.
-@@ -3921,9 +4064,11 @@ aarch64_classify_address (struct aarch64_address_info *info,
- X,X: 7-bit signed scaled offset
- Q: 9-bit signed offset
- We conservatively require an offset representable in either mode.
-- */
-+ When performing the check for pairs of X registers, i.e. LDP/STP, pass
-+ down DImode since that is the natural size of the LDP/STP instruction's
-+ memory accesses. */
- if (mode == TImode || mode == TFmode)
-- return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
-+ return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
- && offset_9bit_signed_unscaled_p (mode, offset));
-
- /* A 7bit offset check because OImode will emit a ldp/stp
-@@ -4031,7 +4176,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
- return ((GET_CODE (sym) == LABEL_REF
- || (GET_CODE (sym) == SYMBOL_REF
- && CONSTANT_POOL_ADDRESS_P (sym)
-- && !aarch64_nopcrelative_literal_loads)));
-+ && aarch64_pcrelative_literal_loads)));
- }
- return false;
-
-@@ -4125,6 +4270,24 @@ aarch64_legitimate_address_p (machine_mode mode, rtx x,
- return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
- }
-
-+/* Split an out-of-range address displacement into a base and offset.
-+ Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
-+ to increase opportunities for sharing the base address between accesses
-+ of different sizes. For TI/TFmode and unaligned accesses use a 256-byte
-+ range. */
-+static bool
-+aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
-+{
-+ HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
-+
-+ if (mode == TImode || mode == TFmode
-+ || (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
-+ mask = 0xff;
-+
-+ *off = GEN_INT (INTVAL (*disp) & ~mask);
-+ *disp = GEN_INT (INTVAL (*disp) & mask);
-+ return true;
-+}
-+
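/* Illustrative note, not part of the patch: for an aligned SImode access
   at displacement 0x4008, GET_MODE_SIZE is 4, so the mask is 0x3fff and
   the split yields *off == 0x4000 (folded into the base) and
   *disp == 0x8 (kept in the memory operand).  A TImode, TFmode or
   unaligned access would use the 256-byte mask 0xff instead.  */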
- /* Return TRUE if rtx X is immediate constant 0.0 */
- bool
- aarch64_float_const_zero_rtx_p (rtx x)
-@@ -4198,6 +4361,14 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
- && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
- return CC_NZmode;
-
-+ /* Similarly, comparisons of zero_extends from shorter modes can
-+ be performed using an ANDS with an immediate mask. */
-+ if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
-+ && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
-+ && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
-+ && (code == EQ || code == NE))
-+ return CC_NZmode;
-+
- if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
- && y == const0_rtx
- && (code == EQ || code == NE || code == LT || code == GE)
-@@ -4225,14 +4396,6 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
- && GET_CODE (x) == NEG)
- return CC_Zmode;
-
-- /* A compare of a mode narrower than SI mode against zero can be done
-- by extending the value in the comparison. */
-- if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
-- && y == const0_rtx)
-- /* Only use sign-extension if we really need it. */
-- return ((code == GT || code == GE || code == LE || code == LT)
-- ? CC_SESWPmode : CC_ZESWPmode);
--
- /* A test for unsigned overflow. */
- if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
- && code == NE
-@@ -4301,8 +4464,6 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
- break;
-
- case CC_SWPmode:
-- case CC_ZESWPmode:
-- case CC_SESWPmode:
- switch (comp_code)
- {
- case NE: return AARCH64_NE;
-@@ -4957,7 +5118,7 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
- if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
- {
- rtx base = XEXP (x, 0);
-- rtx offset_rtx XEXP (x, 1);
-+ rtx offset_rtx = XEXP (x, 1);
- HOST_WIDE_INT offset = INTVAL (offset_rtx);
-
- if (GET_CODE (base) == PLUS)
-@@ -5015,120 +5176,6 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
- return x;
- }
-
--/* Try a machine-dependent way of reloading an illegitimate address
-- operand. If we find one, push the reload and return the new rtx. */
--
--rtx
--aarch64_legitimize_reload_address (rtx *x_p,
-- machine_mode mode,
-- int opnum, int type,
-- int ind_levels ATTRIBUTE_UNUSED)
--{
-- rtx x = *x_p;
--
-- /* Do not allow mem (plus (reg, const)) if vector struct mode. */
-- if (aarch64_vect_struct_mode_p (mode)
-- && GET_CODE (x) == PLUS
-- && REG_P (XEXP (x, 0))
-- && CONST_INT_P (XEXP (x, 1)))
-- {
-- rtx orig_rtx = x;
-- x = copy_rtx (x);
-- push_reload (orig_rtx, NULL_RTX, x_p, NULL,
-- BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
-- opnum, (enum reload_type) type);
-- return x;
-- }
--
-- /* We must recognize output that we have already generated ourselves. */
-- if (GET_CODE (x) == PLUS
-- && GET_CODE (XEXP (x, 0)) == PLUS
-- && REG_P (XEXP (XEXP (x, 0), 0))
-- && CONST_INT_P (XEXP (XEXP (x, 0), 1))
-- && CONST_INT_P (XEXP (x, 1)))
-- {
-- push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
-- BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
-- opnum, (enum reload_type) type);
-- return x;
-- }
--
-- /* We wish to handle large displacements off a base register by splitting
-- the addend across an add and the mem insn. This can cut the number of
-- extra insns needed from 3 to 1. It is only useful for load/store of a
-- single register with 12 bit offset field. */
-- if (GET_CODE (x) == PLUS
-- && REG_P (XEXP (x, 0))
-- && CONST_INT_P (XEXP (x, 1))
-- && HARD_REGISTER_P (XEXP (x, 0))
-- && mode != TImode
-- && mode != TFmode
-- && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
-- {
-- HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
-- HOST_WIDE_INT low = val & 0xfff;
-- HOST_WIDE_INT high = val - low;
-- HOST_WIDE_INT offs;
-- rtx cst;
-- machine_mode xmode = GET_MODE (x);
--
-- /* In ILP32, xmode can be either DImode or SImode. */
-- gcc_assert (xmode == DImode || xmode == SImode);
--
-- /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
-- BLKmode alignment. */
-- if (GET_MODE_SIZE (mode) == 0)
-- return NULL_RTX;
--
-- offs = low % GET_MODE_SIZE (mode);
--
-- /* Align misaligned offset by adjusting high part to compensate. */
-- if (offs != 0)
-- {
-- if (aarch64_uimm12_shift (high + offs))
-- {
-- /* Align down. */
-- low = low - offs;
-- high = high + offs;
-- }
-- else
-- {
-- /* Align up. */
-- offs = GET_MODE_SIZE (mode) - offs;
-- low = low + offs;
-- high = high + (low & 0x1000) - offs;
-- low &= 0xfff;
-- }
-- }
--
-- /* Check for overflow. */
-- if (high + low != val)
-- return NULL_RTX;
--
-- cst = GEN_INT (high);
-- if (!aarch64_uimm12_shift (high))
-- cst = force_const_mem (xmode, cst);
--
-- /* Reload high part into base reg, leaving the low part
-- in the mem instruction.
-- Note that replacing this gen_rtx_PLUS with plus_constant is
-- wrong in this case because we rely on the
-- (plus (plus reg c1) c2) structure being preserved so that
-- XEXP (*p, 0) in push_reload below uses the correct term. */
-- x = gen_rtx_PLUS (xmode,
-- gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
-- GEN_INT (low));
--
-- push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
-- BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
-- opnum, (enum reload_type) type);
-- return x;
-- }
--
-- return NULL_RTX;
--}
--
--
- /* Return the reload icode required for a constant pool in mode. */
- static enum insn_code
- aarch64_constant_pool_reload_icode (machine_mode mode)
-@@ -5186,7 +5233,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
- if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
- && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
- || targetm.vector_mode_supported_p (GET_MODE (x)))
-- && aarch64_nopcrelative_literal_loads)
-+ && !aarch64_pcrelative_literal_loads)
- {
- sri->icode = aarch64_constant_pool_reload_icode (mode);
- return NO_REGS;
-@@ -5260,18 +5307,18 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to)
- if (to == HARD_FRAME_POINTER_REGNUM)
- {
- if (from == ARG_POINTER_REGNUM)
-- return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
-+ return cfun->machine->frame.hard_fp_offset;
-
- if (from == FRAME_POINTER_REGNUM)
-- return (cfun->machine->frame.hard_fp_offset
-- - cfun->machine->frame.saved_varargs_size);
-+ return cfun->machine->frame.hard_fp_offset
-+ - cfun->machine->frame.locals_offset;
- }
-
- if (to == STACK_POINTER_REGNUM)
- {
- if (from == FRAME_POINTER_REGNUM)
-- return (cfun->machine->frame.frame_size
-- - cfun->machine->frame.saved_varargs_size);
-+ return cfun->machine->frame.frame_size
-+ - cfun->machine->frame.locals_offset;
- }
-
- return cfun->machine->frame.frame_size;
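/* Illustrative note, not part of the patch: for a frame with no varargs
   save area (locals_offset == 0), 32 bytes of locals and 16 bytes of
   callee saves, hard_fp_offset is 48, so FRAME_POINTER_REGNUM now
   eliminates to HARD_FRAME_POINTER_REGNUM at offset 48 - 0 == 48.  */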
-@@ -5418,7 +5465,10 @@ aarch64_elf_asm_constructor (rtx symbol, int priority)
- else
- {
- section *s;
-- char buf[18];
-+ /* While priority is known to be in the range [0, 65535], and so 18
-+ bytes would be enough, the compiler might not know that. To avoid a
-+ -Wformat-truncation false positive, use a larger size. */
-+ char buf[23];
- snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
- s = get_section (buf, SECTION_WRITE, NULL);
- switch_to_section (s);
-@@ -5435,7 +5485,10 @@ aarch64_elf_asm_destructor (rtx symbol, int priority)
- else
- {
- section *s;
-- char buf[18];
-+ /* While priority is known to be in the range [0, 65535], and so 18
-+ bytes would be enough, the compiler might not know that. To avoid a
-+ -Wformat-truncation false positive, use a larger size. */
-+ char buf[23];
- snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
- s = get_section (buf, SECTION_WRITE, NULL);
- switch_to_section (s);
-@@ -5520,7 +5573,7 @@ aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
- static inline bool
- aarch64_can_use_per_function_literal_pools_p (void)
- {
-- return (!aarch64_nopcrelative_literal_loads
-+ return (aarch64_pcrelative_literal_loads
- || aarch64_cmodel == AARCH64_CMODEL_LARGE);
- }
-
-@@ -6139,6 +6192,19 @@ aarch64_extend_bitfield_pattern_p (rtx x)
- return op;
- }
-
-+/* Return true if the mask and a shift amount from an RTX of the form
-+ (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
-+ mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
-+
-+bool
-+aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
-+{
-+ return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
-+ && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
-+ && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
-+ && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
-+}
-+
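/* Illustrative note, not part of the patch: in SImode with
   shft_amnt == 8 and mask == 0xff00, (0xff00 >> 8) + 1 == 0x100 is a
   power of two and the low 8 bits of the mask are clear, so the
   combination is accepted and (x << 8) & 0xff00 can be emitted as a
   single "ubfiz w0, w1, 8, 8".  */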
- /* Calculate the cost of calculating X, storing it in *COST. Result
- is true if the total cost of the operation has now been calculated. */
- static bool
-@@ -6404,10 +6470,6 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
- /* TODO: A write to the CC flags possibly costs extra, this
- needs encoding in the cost tables. */
-
-- /* CC_ZESWPmode supports zero extend for free. */
-- if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
-- op0 = XEXP (op0, 0);
--
- mode = GET_MODE (op0);
- /* ANDS. */
- if (GET_CODE (op0) == AND)
-@@ -6717,17 +6779,31 @@ cost_plus:
-
- if (GET_MODE_CLASS (mode) == MODE_INT)
- {
-- /* We possibly get the immediate for free, this is not
-- modelled. */
-- if (CONST_INT_P (op1)
-- && aarch64_bitmask_imm (INTVAL (op1), mode))
-+ if (CONST_INT_P (op1))
- {
-- *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
-+ /* We have a mask + shift version of a UBFIZ
-+ i.e. the *andim_ashift<mode>_bfiz pattern. */
-+ if (GET_CODE (op0) == ASHIFT
-+ && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
-+ XEXP (op0, 1)))
-+ {
-+ *cost += rtx_cost (XEXP (op0, 0), mode,
-+ (enum rtx_code) code, 0, speed);
-+ if (speed)
-+ *cost += extra_cost->alu.bfx;
-
-- if (speed)
-- *cost += extra_cost->alu.logical;
-+ return true;
-+ }
-+ else if (aarch64_bitmask_imm (INTVAL (op1), mode))
-+ {
-+ /* We possibly get the immediate for free, this is not
-+ modelled. */
-+ *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
-+ if (speed)
-+ *cost += extra_cost->alu.logical;
-
-- return true;
-+ return true;
-+ }
- }
- else
- {
-@@ -6831,11 +6907,12 @@ cost_plus:
- {
- int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
-
-- if (!op_cost && speed)
-- /* MOV. */
-- *cost += extra_cost->alu.extend;
-- else
-- /* Free, the cost is that of the SI mode operation. */
-+ /* If OP_COST is non-zero, then the cost of the zero extend
-+ is effectively the cost of the inner operation. Otherwise
-+ we have a MOV instruction and we take the cost from the MOV
-+ itself. This is true independently of whether we are
-+ optimizing for space or time. */
-+ if (op_cost)
- *cost = op_cost;
-
- return true;
-@@ -6865,8 +6942,8 @@ cost_plus:
- }
- else
- {
-- /* UXTB/UXTH. */
-- *cost += extra_cost->alu.extend;
-+ /* We generate an AND instead of UXTB/UXTH. */
-+ *cost += extra_cost->alu.logical;
- }
- }
- return false;
-@@ -7349,7 +7426,8 @@ cost_plus:
- break;
- }
-
-- if (dump_file && (dump_flags & TDF_DETAILS))
-+ if (dump_file
-+ && flag_aarch64_verbose_cost)
- fprintf (dump_file,
- "\nFailed to cost RTX. Assuming default cost.\n");
-
-@@ -7365,7 +7443,8 @@ aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
- {
- bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
-
-- if (dump_file && (dump_flags & TDF_DETAILS))
-+ if (dump_file
-+ && flag_aarch64_verbose_cost)
- {
- print_rtl_single (dump_file, x);
- fprintf (dump_file, "\n%s cost: %d (%s)\n",
-@@ -7445,12 +7524,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
- to optimize 1.0/sqrt. */
-
- static bool
--use_rsqrt_p (void)
-+use_rsqrt_p (machine_mode mode)
- {
- return (!flag_trapping_math
- && flag_unsafe_math_optimizations
-- && ((aarch64_tune_params.extra_tuning_flags
-- & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
-+ && ((aarch64_tune_params.approx_modes->recip_sqrt
-+ & AARCH64_APPROX_MODE (mode))
- || flag_mrecip_low_precision_sqrt));
- }
-
-@@ -7460,89 +7539,225 @@ use_rsqrt_p (void)
- static tree
- aarch64_builtin_reciprocal (tree fndecl)
- {
-- if (!use_rsqrt_p ())
-+ machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
-+
-+ if (!use_rsqrt_p (mode))
- return NULL_TREE;
- return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
- }
-
- typedef rtx (*rsqrte_type) (rtx, rtx);
-
--/* Select reciprocal square root initial estimate
-- insn depending on machine mode. */
-+/* Select reciprocal square root initial estimate insn depending on machine
-+ mode. */
-
--rsqrte_type
-+static rsqrte_type
- get_rsqrte_type (machine_mode mode)
- {
- switch (mode)
- {
-- case DFmode: return gen_aarch64_rsqrte_df2;
-- case SFmode: return gen_aarch64_rsqrte_sf2;
-- case V2DFmode: return gen_aarch64_rsqrte_v2df2;
-- case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
-- case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
-+ case DFmode: return gen_aarch64_rsqrtedf;
-+ case SFmode: return gen_aarch64_rsqrtesf;
-+ case V2DFmode: return gen_aarch64_rsqrtev2df;
-+ case V2SFmode: return gen_aarch64_rsqrtev2sf;
-+ case V4SFmode: return gen_aarch64_rsqrtev4sf;
- default: gcc_unreachable ();
- }
- }
-
- typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
-
--/* Select reciprocal square root Newton-Raphson step
-- insn depending on machine mode. */
-+/* Select reciprocal square root series step insn depending on machine mode. */
-
--rsqrts_type
-+static rsqrts_type
- get_rsqrts_type (machine_mode mode)
- {
- switch (mode)
- {
-- case DFmode: return gen_aarch64_rsqrts_df3;
-- case SFmode: return gen_aarch64_rsqrts_sf3;
-- case V2DFmode: return gen_aarch64_rsqrts_v2df3;
-- case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
-- case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
-+ case DFmode: return gen_aarch64_rsqrtsdf;
-+ case SFmode: return gen_aarch64_rsqrtssf;
-+ case V2DFmode: return gen_aarch64_rsqrtsv2df;
-+ case V2SFmode: return gen_aarch64_rsqrtsv2sf;
-+ case V4SFmode: return gen_aarch64_rsqrtsv4sf;
- default: gcc_unreachable ();
- }
- }
-
--/* Emit instruction sequence to compute the reciprocal square root using the
-- Newton-Raphson series. Iterate over the series twice for SF
-- and thrice for DF. */
-+/* Emit instruction sequence to compute either the approximate square root
-+ or its approximate reciprocal, depending on the flag RECP, and return
-+ whether the sequence was emitted or not. */
-
--void
--aarch64_emit_approx_rsqrt (rtx dst, rtx src)
-+bool
-+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
- {
-- machine_mode mode = GET_MODE (src);
-- gcc_assert (
-- mode == SFmode || mode == V2SFmode || mode == V4SFmode
-- || mode == DFmode || mode == V2DFmode);
-+ machine_mode mode = GET_MODE (dst);
-+
-+ if (GET_MODE_INNER (mode) == HFmode)
-+ return false;
-
-- rtx xsrc = gen_reg_rtx (mode);
-- emit_move_insn (xsrc, src);
-- rtx x0 = gen_reg_rtx (mode);
-+ machine_mode mmsk = mode_for_vector
-+ (int_mode_for_mode (GET_MODE_INNER (mode)),
-+ GET_MODE_NUNITS (mode));
-+ bool use_approx_sqrt_p = (!recp
-+ && (flag_mlow_precision_sqrt
-+ || (aarch64_tune_params.approx_modes->sqrt
-+ & AARCH64_APPROX_MODE (mode))));
-+ bool use_approx_rsqrt_p = (recp
-+ && (flag_mrecip_low_precision_sqrt
-+ || (aarch64_tune_params.approx_modes->recip_sqrt
-+ & AARCH64_APPROX_MODE (mode))));
-+
-+ if (!flag_finite_math_only
-+ || flag_trapping_math
-+ || !flag_unsafe_math_optimizations
-+ || !(use_approx_sqrt_p || use_approx_rsqrt_p)
-+ || optimize_function_for_size_p (cfun))
-+ return false;
-
-- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
-+ rtx xmsk = gen_reg_rtx (mmsk);
-+ if (!recp)
-+ /* When calculating the approximate square root, compare the argument with
-+ 0.0 and create a mask. */
-+ emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
-+ CONST0_RTX (mode)))));
-
-- bool double_mode = (mode == DFmode || mode == V2DFmode);
-+ /* Estimate the approximate reciprocal square root. */
-+ rtx xdst = gen_reg_rtx (mode);
-+ emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
-
-- int iterations = double_mode ? 3 : 2;
-+ /* Iterate over the series twice for SF and thrice for DF. */
-+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
-
-- /* Optionally iterate over the series one less time than otherwise. */
-- if (flag_mrecip_low_precision_sqrt)
-+ /* Optionally iterate over the series one time fewer, trading some
-+ accuracy for faster performance. */
-+ if ((recp && flag_mrecip_low_precision_sqrt)
-+ || (!recp && flag_mlow_precision_sqrt))
- iterations--;
-
-- for (int i = 0; i < iterations; ++i)
-+ /* Iterate over the series to calculate the approximate reciprocal square
-+ root. */
-+ rtx x1 = gen_reg_rtx (mode);
-+ while (iterations--)
- {
-- rtx x1 = gen_reg_rtx (mode);
- rtx x2 = gen_reg_rtx (mode);
-- rtx x3 = gen_reg_rtx (mode);
-- emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
-+ emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
-+
-+ emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
-+
-+ if (iterations > 0)
-+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
-+ }
-+
-+ if (!recp)
-+ {
-+ /* Qualify the approximate reciprocal square root when the argument is
-+ 0.0 by squashing the intermediate result to 0.0. */
-+ rtx xtmp = gen_reg_rtx (mmsk);
-+ emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
-+ gen_rtx_SUBREG (mmsk, xdst, 0)));
-+ emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
-+
-+ /* Calculate the approximate square root. */
-+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
-+ }
-+
-+ /* Finalize the approximation. */
-+ emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
-+
-+ return true;
-+}
-+
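/* Illustrative note, not part of the patch: the loop above implements
   the Newton-Raphson iteration for 1/sqrt(d),
       x_{n+1} = x_n * (3 - d * x_n * x_n) / 2,
   where FRSQRTE supplies the initial estimate and each FRSQRTS step
   computes (3 - d * x_n * x_n) / 2 from d and the squared estimate.  */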
-+typedef rtx (*recpe_type) (rtx, rtx);
-+
-+/* Select reciprocal initial estimate insn depending on machine mode. */
-+
-+static recpe_type
-+get_recpe_type (machine_mode mode)
-+{
-+ switch (mode)
-+ {
-+ case SFmode: return (gen_aarch64_frecpesf);
-+ case V2SFmode: return (gen_aarch64_frecpev2sf);
-+ case V4SFmode: return (gen_aarch64_frecpev4sf);
-+ case DFmode: return (gen_aarch64_frecpedf);
-+ case V2DFmode: return (gen_aarch64_frecpev2df);
-+ default: gcc_unreachable ();
-+ }
-+}
-+
-+typedef rtx (*recps_type) (rtx, rtx, rtx);
-+
-+/* Select reciprocal series step insn depending on machine mode. */
-+
-+static recps_type
-+get_recps_type (machine_mode mode)
-+{
-+ switch (mode)
-+ {
-+ case SFmode: return (gen_aarch64_frecpssf);
-+ case V2SFmode: return (gen_aarch64_frecpsv2sf);
-+ case V4SFmode: return (gen_aarch64_frecpsv4sf);
-+ case DFmode: return (gen_aarch64_frecpsdf);
-+ case V2DFmode: return (gen_aarch64_frecpsv2df);
-+ default: gcc_unreachable ();
-+ }
-+}
-+
-+/* Emit the instruction sequence to compute the approximation for the division
-+ of NUM by DEN in QUO and return whether the sequence was emitted or not. */
-+
-+bool
-+aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
-+{
-+ machine_mode mode = GET_MODE (quo);
-
-- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
-+ if (GET_MODE_INNER (mode) == HFmode)
-+ return false;
-+
-+ bool use_approx_division_p = (flag_mlow_precision_div
-+ || (aarch64_tune_params.approx_modes->division
-+ & AARCH64_APPROX_MODE (mode)));
-+
-+ if (!flag_finite_math_only
-+ || flag_trapping_math
-+ || !flag_unsafe_math_optimizations
-+ || optimize_function_for_size_p (cfun)
-+ || !use_approx_division_p)
-+ return false;
-+
-+ /* Estimate the approximate reciprocal. */
-+ rtx xrcp = gen_reg_rtx (mode);
-+ emit_insn ((*get_recpe_type (mode)) (xrcp, den));
-+
-+ /* Iterate over the series twice for SF and thrice for DF. */
-+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
-+
-+ /* Optionally iterate over the series one time fewer, trading some
-+ accuracy for faster performance. */
-+ if (flag_mlow_precision_div)
-+ iterations--;
-
-- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-- x0 = x1;
-+ /* Iterate over the series to calculate the approximate reciprocal. */
-+ rtx xtmp = gen_reg_rtx (mode);
-+ while (iterations--)
-+ {
-+ emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
-+
-+ if (iterations > 0)
-+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
-+ }
-+
-+ if (num != CONST1_RTX (mode))
-+ {
-+ /* As the approximate reciprocal of DEN is already calculated, only
-+ calculate the approximate division when NUM is not 1.0. */
-+ rtx xnum = force_reg (mode, num);
-+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
- }
-
-- emit_move_insn (dst, x0);
-+ /* Finalize the approximation. */
-+ emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
-+ return true;
- }
-
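/* Illustrative note, not part of the patch: the division approximation
   uses the Newton-Raphson iteration for the reciprocal 1/d,
       x_{n+1} = x_n * (2 - d * x_n),
   where FRECPE supplies the initial estimate and each FRECPS step
   computes (2 - d * x_n); the quotient is then formed as num * (1/d).  */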
- /* Return the number of instructions that can be issued per cycle. */
-@@ -8046,32 +8261,37 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
- opts->x_align_functions = aarch64_tune_params.function_align;
- }
-
-- /* If nopcrelative_literal_loads is set on the command line, this
-+ /* We default to no pc-relative literal loads. */
-+
-+ aarch64_pcrelative_literal_loads = false;
-+
-+ /* If -mpc-relative-literal-loads is set on the command line, this
- implies that the user asked for PC relative literal loads. */
-- if (opts->x_nopcrelative_literal_loads == 1)
-- aarch64_nopcrelative_literal_loads = false;
-+ if (opts->x_pcrelative_literal_loads == 1)
-+ aarch64_pcrelative_literal_loads = true;
-
-- /* If it is not set on the command line, we default to no pc
-- relative literal loads, unless the workaround for Cortex-A53
-- erratum 843419 is in effect. */
- /* This is PR70113. When building the Linux kernel with
- CONFIG_ARM64_ERRATUM_843419, support for relocations
- R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
- removed from the kernel to avoid loading objects with possibly
-- offending sequences. With nopcrelative_literal_loads, we would
-+ offending sequences. Without -mpc-relative-literal-loads we would
- generate such relocations, preventing the kernel build from
- succeeding. */
-- if (opts->x_nopcrelative_literal_loads == 2
-- && !TARGET_FIX_ERR_A53_843419)
-- aarch64_nopcrelative_literal_loads = true;
-+ if (opts->x_pcrelative_literal_loads == 2
-+ && TARGET_FIX_ERR_A53_843419)
-+ aarch64_pcrelative_literal_loads = true;
-
-- /* In the tiny memory model it makes no sense
-- to disallow non PC relative literal pool loads
-- as many other things will break anyway. */
-- if (opts->x_nopcrelative_literal_loads
-- && (aarch64_cmodel == AARCH64_CMODEL_TINY
-- || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
-- aarch64_nopcrelative_literal_loads = false;
-+ /* In the tiny memory model it makes no sense to disallow PC relative
-+ literal pool loads. */
-+ if (aarch64_cmodel == AARCH64_CMODEL_TINY
-+ || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
-+ aarch64_pcrelative_literal_loads = true;
-+
-+ /* When enabling the lower precision Newton series for the square root, also
-+ enable it for the reciprocal square root, since the latter is an
-+ intermediate step for the former. */
-+ if (flag_mlow_precision_sqrt)
-+ flag_mrecip_low_precision_sqrt = true;
- }
-
- /* 'Unpack' up the internal tuning structs and update the options
-@@ -8374,9 +8594,6 @@ aarch64_override_options (void)
- while processing functions with potential target attributes. */
- target_option_default_node = target_option_current_node
- = build_target_option_node (&global_options);
--
-- aarch64_register_fma_steering ();
--
- }
-
- /* Implement targetm.override_options_after_change. */
-@@ -9279,15 +9496,18 @@ aarch64_classify_symbol (rtx x, rtx offset)
- switch (aarch64_cmodel)
- {
- case AARCH64_CMODEL_TINY:
-- /* When we retreive symbol + offset address, we have to make sure
-+ /* When we retrieve symbol + offset address, we have to make sure
- the offset does not cause overflow of the final address. But
- we have no way of knowing the address of symbol at compile time
- so we can't accurately say if the distance between the PC and
- symbol + offset is outside the addressible range of +/-1M in the
- TINY code model. So we rely on images not being greater than
- 1M and cap the offset at 1M and anything beyond 1M will have to
-- be loaded using an alternative mechanism. */
-- if (SYMBOL_REF_WEAK (x)
-+ be loaded using an alternative mechanism. Furthermore, if the
-+ symbol is a weak reference to something that isn't known to
-+ resolve to a symbol in this module, then force it to memory. */
-+ if ((SYMBOL_REF_WEAK (x)
-+ && !aarch64_symbol_binds_local_p (x))
- || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
- return SYMBOL_FORCE_TO_MEM;
- return SYMBOL_TINY_ABSOLUTE;
-@@ -9295,7 +9515,8 @@ aarch64_classify_symbol (rtx x, rtx offset)
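/* Illustrative note, not part of the patch: under the TINY model an
   access to sym + 0x200000 is forced to memory, since 0x200000 exceeds
   the 1M cap, while sym + 1048575 can stay SYMBOL_TINY_ABSOLUTE
   provided the symbol binds locally.  */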
- case AARCH64_CMODEL_SMALL:
- /* Same reasoning as the tiny code model, but the offset cap here is
- 4G. */
-- if (SYMBOL_REF_WEAK (x)
-+ if ((SYMBOL_REF_WEAK (x)
-+ && !aarch64_symbol_binds_local_p (x))
- || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
- HOST_WIDE_INT_C (4294967264)))
- return SYMBOL_FORCE_TO_MEM;
-@@ -9317,8 +9538,7 @@ aarch64_classify_symbol (rtx x, rtx offset)
- /* This is alright even in PIC code as the constant
- pool reference is always PC relative and within
- the same translation unit. */
-- if (nopcrelative_literal_loads
-- && CONSTANT_POOL_ADDRESS_P (x))
-+ if (CONSTANT_POOL_ADDRESS_P (x))
- return SYMBOL_SMALL_ABSOLUTE;
- else
- return SYMBOL_FORCE_TO_MEM;
-@@ -9454,6 +9674,13 @@ aarch64_build_builtin_va_list (void)
- FIELD_DECL, get_identifier ("__vr_offs"),
- integer_type_node);
-
-+ /* Tell tree-stdarg pass about our internal offset fields.
-+ NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
-+ purposes, to identify whether the code is updating va_list internal
-+ offset fields in an irregular way. */
-+ va_list_gpr_counter_field = f_groff;
-+ va_list_fpr_counter_field = f_vroff;
-+
- DECL_ARTIFICIAL (f_stack) = 1;
- DECL_ARTIFICIAL (f_grtop) = 1;
- DECL_ARTIFICIAL (f_vrtop) = 1;
-@@ -9486,15 +9713,17 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
- tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
- tree stack, grtop, vrtop, groff, vroff;
- tree t;
-- int gr_save_area_size;
-- int vr_save_area_size;
-+ int gr_save_area_size = cfun->va_list_gpr_size;
-+ int vr_save_area_size = cfun->va_list_fpr_size;
- int vr_offset;
-
- cum = &crtl->args.info;
-- gr_save_area_size
-- = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
-- vr_save_area_size
-- = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
-+ if (cfun->va_list_gpr_size)
-+ gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
-+ cfun->va_list_gpr_size);
-+ if (cfun->va_list_fpr_size)
-+ vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
-+ * UNITS_PER_VREG, cfun->va_list_fpr_size);
-
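/* Illustrative note, not part of the patch: if tree-stdarg determined
   that at most 16 bytes of the GP save area are read
   (va_list_gpr_size == 16) while three argument registers remain
   unnamed (24 bytes), only MIN (24, 16) == 16 bytes are saved instead
   of the full area.  */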
- if (!TARGET_FLOAT)
- {
-@@ -9823,7 +10052,8 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
- {
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- CUMULATIVE_ARGS local_cum;
-- int gr_saved, vr_saved;
-+ int gr_saved = cfun->va_list_gpr_size;
-+ int vr_saved = cfun->va_list_fpr_size;
-
- /* The caller has advanced CUM up to, but not beyond, the last named
- argument. Advance a local copy of CUM past the last "real" named
-@@ -9831,9 +10061,14 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
- local_cum = *cum;
- aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
-
-- /* Found out how many registers we need to save. */
-- gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
-- vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
-+ /* Find out how many registers we need to save.
-+ Honor the tree-stdarg analysis results. */
-+ if (cfun->va_list_gpr_size)
-+ gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
-+ cfun->va_list_gpr_size / UNITS_PER_WORD);
-+ if (cfun->va_list_fpr_size)
-+ vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
-+ cfun->va_list_fpr_size / UNITS_PER_VREG);
-
- if (!TARGET_FLOAT)
- {
-@@ -9861,7 +10096,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
- /* We can't use move_block_from_reg, because it will use
- the wrong mode, storing D regs only. */
- machine_mode mode = TImode;
-- int off, i;
-+ int off, i, vr_start;
-
- /* Set OFF to the offset from virtual_incoming_args_rtx of
- the first vector register. The VR save area lies below
-@@ -9870,14 +10105,15 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
- STACK_BOUNDARY / BITS_PER_UNIT);
- off -= vr_saved * UNITS_PER_VREG;
-
-- for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
-+ vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
-+ for (i = 0; i < vr_saved; ++i)
- {
- rtx ptr, mem;
-
- ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
- mem = gen_frame_mem (mode, ptr);
- set_mem_alias_set (mem, get_varargs_alias_set ());
-- aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
-+ aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
- off += UNITS_PER_VREG;
- }
- }
-@@ -10839,33 +11075,6 @@ aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
- gen_rtx_REG (mode, rsrc + count - i - 1));
- }
-
--/* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
-- one of VSTRUCT modes: OI, CI or XI. */
--int
--aarch64_simd_attr_length_move (rtx_insn *insn)
--{
-- machine_mode mode;
--
-- extract_insn_cached (insn);
--
-- if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
-- {
-- mode = GET_MODE (recog_data.operand[0]);
-- switch (mode)
-- {
-- case OImode:
-- return 8;
-- case CImode:
-- return 12;
-- case XImode:
-- return 16;
-- default:
-- gcc_unreachable ();
-- }
-- }
-- return 4;
--}
--
- /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
- one of VSTRUCT modes: OI, CI, or XI. */
- int
-@@ -10899,6 +11108,37 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
- return true;
- }
-
-+/* Return true if the vector misalignment factor is supported by the
-+ target. */
-+static bool
-+aarch64_builtin_support_vector_misalignment (machine_mode mode,
-+ const_tree type, int misalignment,
-+ bool is_packed)
-+{
-+ if (TARGET_SIMD && STRICT_ALIGNMENT)
-+ {
-+ /* Return if movmisalign pattern is not supported for this mode. */
-+ if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
-+ return false;
-+
-+ if (misalignment == -1)
-+ {
-+ /* Misalignment factor is unknown at compile time but we know
-+ it's word aligned. */
-+ if (aarch64_simd_vector_alignment_reachable (type, is_packed))
-+ {
-+ int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
-+
-+ if (element_size != 64)
-+ return true;
-+ }
-+ return false;
-+ }
-+ }
-+ return default_builtin_support_vector_misalignment (mode, type, misalignment,
-+ is_packed);
-+}
-+
- /* If VALS is a vector constant that can be loaded into a register
- using DUP, generate instructions to do so and return an RTX to
- assign to the register. Otherwise return NULL_RTX. */
-@@ -11947,12 +12187,11 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
- info.value = GEN_INT (0);
- else
- {
--#define buf_size 20
-+ const unsigned int buf_size = 20;
- char float_buf[buf_size] = {'\0'};
- real_to_decimal_for_mode (float_buf,
- CONST_DOUBLE_REAL_VALUE (info.value),
- buf_size, buf_size, 1, mode);
--#undef buf_size
-
- if (lane_count == 1)
- snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
-@@ -12186,6 +12425,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
- case V4SImode: gen = gen_aarch64_trn2v4si; break;
- case V2SImode: gen = gen_aarch64_trn2v2si; break;
- case V2DImode: gen = gen_aarch64_trn2v2di; break;
-+ case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
-+ case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
- case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
- case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
- case V2DFmode: gen = gen_aarch64_trn2v2df; break;
-@@ -12204,6 +12445,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
- case V4SImode: gen = gen_aarch64_trn1v4si; break;
- case V2SImode: gen = gen_aarch64_trn1v2si; break;
- case V2DImode: gen = gen_aarch64_trn1v2di; break;
-+ case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
-+ case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
- case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
- case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
- case V2DFmode: gen = gen_aarch64_trn1v2df; break;
-@@ -12269,6 +12512,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
- case V4SImode: gen = gen_aarch64_uzp2v4si; break;
- case V2SImode: gen = gen_aarch64_uzp2v2si; break;
- case V2DImode: gen = gen_aarch64_uzp2v2di; break;
-+ case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
-+ case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
- case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
- case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
- case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
-@@ -12287,6 +12532,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
- case V4SImode: gen = gen_aarch64_uzp1v4si; break;
- case V2SImode: gen = gen_aarch64_uzp1v2si; break;
- case V2DImode: gen = gen_aarch64_uzp1v2di; break;
-+ case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
-+ case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
- case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
- case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
- case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
-@@ -12357,6 +12604,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
- case V4SImode: gen = gen_aarch64_zip2v4si; break;
- case V2SImode: gen = gen_aarch64_zip2v2si; break;
- case V2DImode: gen = gen_aarch64_zip2v2di; break;
-+ case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
-+ case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
- case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
- case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
- case V2DFmode: gen = gen_aarch64_zip2v2df; break;
-@@ -12375,6 +12624,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
- case V4SImode: gen = gen_aarch64_zip1v4si; break;
- case V2SImode: gen = gen_aarch64_zip1v2si; break;
- case V2DImode: gen = gen_aarch64_zip1v2di; break;
-+ case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
-+ case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
- case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
- case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
- case V2DFmode: gen = gen_aarch64_zip1v2df; break;
-@@ -12419,6 +12670,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
- case V8HImode: gen = gen_aarch64_extv8hi; break;
- case V2SImode: gen = gen_aarch64_extv2si; break;
- case V4SImode: gen = gen_aarch64_extv4si; break;
-+ case V4HFmode: gen = gen_aarch64_extv4hf; break;
-+ case V8HFmode: gen = gen_aarch64_extv8hf; break;
- case V2SFmode: gen = gen_aarch64_extv2sf; break;
- case V4SFmode: gen = gen_aarch64_extv4sf; break;
- case V2DImode: gen = gen_aarch64_extv2di; break;
-@@ -12494,6 +12747,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
- case V2SImode: gen = gen_aarch64_rev64v2si; break;
- case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
- case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
-+ case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
-+ case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
- default:
- return false;
- }
-@@ -12737,24 +12992,6 @@ aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
- return ret;
- }
-
--/* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
--bool
--aarch64_cannot_change_mode_class (machine_mode from,
-- machine_mode to,
-- enum reg_class rclass)
--{
-- /* We cannot allow word_mode subregs of full vector modes.
-- Otherwise the middle-end will assume it's ok to store to
-- (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
-- of the 128-bit register. However, after reload the subreg will
-- be dropped leaving a plain DImode store. See PR67609 for a more
--     detailed discussion.  In all other cases, we want to be permissive
-- and return false. */
-- return (reg_classes_intersect_p (FP_REGS, rclass)
-- && GET_MODE_SIZE (to) == UNITS_PER_WORD
-- && GET_MODE_SIZE (from) > UNITS_PER_WORD);
--}
--
- rtx
- aarch64_reverse_mask (enum machine_mode mode)
- {
-@@ -12776,7 +13013,14 @@ aarch64_reverse_mask (enum machine_mode mode)
- return force_reg (V16QImode, mask);
- }
-
--/* Implement MODES_TIEABLE_P. */
-+/* Implement MODES_TIEABLE_P. In principle we should always return true.
-+   However, due to issues with register allocation it is preferable to avoid
-+   tying integer scalar and FP scalar modes.  Executing integer operations
-+ in general registers is better than treating them as scalar vector
-+ operations. This reduces latency and avoids redundant int<->FP moves.
-+ So tie modes if they are either the same class, or vector modes with
-+ other vector modes, vector structs or any scalar mode.
-+*/
-
- bool
- aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
-@@ -12787,9 +13031,12 @@ aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
- /* We specifically want to allow elements of "structure" modes to
- be tieable to the structure. This more general condition allows
- other rarer situations too. */
-- if (TARGET_SIMD
-- && aarch64_vector_mode_p (mode1)
-- && aarch64_vector_mode_p (mode2))
-+ if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
-+ return true;
-+
-+ /* Also allow any scalar modes with vectors. */
-+ if (aarch64_vector_mode_supported_p (mode1)
-+ || aarch64_vector_mode_supported_p (mode2))
- return true;
-
- return false;
-@@ -12953,6 +13200,63 @@ aarch64_expand_movmem (rtx *operands)
- return true;
- }
-
-+/* Split a DImode store of a CONST_INT SRC to MEM DST as two
-+ SImode stores. Handle the case when the constant has identical
-+ bottom and top halves. This is beneficial when the two stores can be
-+ merged into an STP and we avoid synthesising potentially expensive
-+ immediates twice. Return true if such a split is possible. */
-+
-+bool
-+aarch64_split_dimode_const_store (rtx dst, rtx src)
-+{
-+ rtx lo = gen_lowpart (SImode, src);
-+ rtx hi = gen_highpart_mode (SImode, DImode, src);
-+
-+ bool size_p = optimize_function_for_size_p (cfun);
-+
-+ if (!rtx_equal_p (lo, hi))
-+ return false;
-+
-+ unsigned int orig_cost
-+ = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
-+ unsigned int lo_cost
-+ = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
-+
-+ /* We want to transform:
-+ MOV x1, 49370
-+ MOVK x1, 0x140, lsl 16
-+ MOVK x1, 0xc0da, lsl 32
-+ MOVK x1, 0x140, lsl 48
-+ STR x1, [x0]
-+ into:
-+ MOV w1, 49370
-+ MOVK w1, 0x140, lsl 16
-+ STP w1, w1, [x0]
-+ So we want to perform this only when we save two instructions
-+ or more. When optimizing for size, however, accept any code size
-+ savings we can. */
-+ if (size_p && orig_cost <= lo_cost)
-+ return false;
-+
-+ if (!size_p
-+ && (orig_cost <= lo_cost + 1))
-+ return false;
-+
-+ rtx mem_lo = adjust_address (dst, SImode, 0);
-+ if (!aarch64_mem_pair_operand (mem_lo, SImode))
-+ return false;
-+
-+ rtx tmp_reg = gen_reg_rtx (SImode);
-+ aarch64_expand_mov_immediate (tmp_reg, lo);
-+ rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
-+  /* Don't emit an explicit store pair as this may not always be profitable.
-+ Let the sched-fusion logic decide whether to merge them. */
-+ emit_move_insn (mem_lo, tmp_reg);
-+ emit_move_insn (mem_hi, tmp_reg);
-+
-+ return true;
-+}
-+
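
A reader's illustration, not from the patch itself: the split above fires when the two 32-bit halves of a stored 64-bit constant are equal and materializing one half is sufficiently cheaper. Assuming a compiler with this change, a store such as the following should become a two-instruction immediate plus a pair of SImode stores that sched-fusion can merge into an STP:

  /* Hypothetical example: both halves equal 0x0140c0da (low 16 bits
     0xc0da == 49370), matching the MOV/MOVK/STP sequence shown above.  */
  void
  store_repeated_halves (unsigned long long *p)
  {
    *p = 0x0140c0da0140c0daULL;
  }
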
- /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
-
- static unsigned HOST_WIDE_INT
-@@ -13305,6 +13609,14 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
- return false;
- }
-
-+/* Return true iff the instruction fusion described by OP is enabled. */
-+
-+bool
-+aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
-+{
-+ return (aarch64_tune_params.fusible_ops & op) != 0;
-+}
-+
- /* If MEM is in the form of [base+offset], extract the two parts
- of address and set to BASE and OFFSET, otherwise return false
- after clearing BASE and OFFSET. */
-@@ -13449,6 +13761,26 @@ aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
- return;
- }
-
-+/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
-+ Adjust priority of sha1h instructions so they are scheduled before
-+ other SHA1 instructions. */
-+
-+static int
-+aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
-+{
-+ rtx x = PATTERN (insn);
-+
-+ if (GET_CODE (x) == SET)
-+ {
-+ x = SET_SRC (x);
-+
-+ if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
-+ return priority + 10;
-+ }
-+
-+ return priority;
-+}
-+
- /* Given OPERANDS of consecutive load/store, check if we can merge
- them into ldp/stp. LOAD is true if they are load instructions.
- MODE is the mode of memory operands. */
-@@ -13483,6 +13815,15 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
- if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
- return false;
-
-+ /* If we have SImode and slow unaligned ldp,
-+     check that the alignment is at least 8 bytes.  */
-+ if (mode == SImode
-+ && (aarch64_tune_params.extra_tuning_flags
-+ & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
-+ && !optimize_size
-+ && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
-+ return false;
-+
- /* Check if the addresses are in the form of [base+offset]. */
- extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
- if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
-@@ -13642,6 +13983,15 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
- return false;
- }
-
-+ /* If we have SImode and slow unaligned ldp,
-+     check that the alignment is at least 8 bytes.  */
-+ if (mode == SImode
-+ && (aarch64_tune_params.extra_tuning_flags
-+ & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
-+ && !optimize_size
-+ && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
-+ return false;
-+
- if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
- rclass_1 = FP_REGS;
- else
-@@ -13877,13 +14227,13 @@ aarch64_promoted_type (const_tree t)
- /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
-
- static bool
--aarch64_optab_supported_p (int op, machine_mode, machine_mode,
-+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
- optimization_type opt_type)
- {
- switch (op)
- {
- case rsqrt_optab:
-- return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
-+ return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
-
- default:
- return true;
-@@ -14017,6 +14367,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
- #undef TARGET_LEGITIMATE_CONSTANT_P
- #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
-
-+#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
-+#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
-+ aarch64_legitimize_address_displacement
-+
- #undef TARGET_LIBGCC_CMP_RETURN_MODE
- #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
-
-@@ -14119,6 +14473,10 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
- #undef TARGET_VECTOR_MODE_SUPPORTED_P
- #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
-
-+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
-+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
-+ aarch64_builtin_support_vector_misalignment
-+
- #undef TARGET_ARRAY_MODE_SUPPORTED_P
- #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
-
-@@ -14196,6 +14554,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
- #undef TARGET_CAN_USE_DOLOOP_P
- #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
-
-+#undef TARGET_SCHED_ADJUST_PRIORITY
-+#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
-+
- #undef TARGET_SCHED_MACRO_FUSION_P
- #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
-
-@@ -14220,6 +14581,9 @@ aarch64_optab_supported_p (int op, machine_mode, machine_mode,
- #undef TARGET_OPTAB_SUPPORTED_P
- #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
-
-+#undef TARGET_OMIT_STRUCT_RETURN_REG
-+#define TARGET_OMIT_STRUCT_RETURN_REG true
-+
- struct gcc_target targetm = TARGET_INITIALIZER;
-
- #include "gt-aarch64.h"
---- a/src/gcc/config/aarch64/aarch64.h
-+++ b/src/gcc/config/aarch64/aarch64.h
-@@ -132,9 +132,14 @@ extern unsigned aarch64_architecture_version;
- #define AARCH64_FL_FP (1 << 1) /* Has FP. */
- #define AARCH64_FL_CRYPTO (1 << 2) /* Has crypto. */
- #define AARCH64_FL_CRC (1 << 3) /* Has CRC. */
--/* ARMv8.1 architecture extensions. */
-+/* ARMv8.1-A architecture extensions. */
- #define AARCH64_FL_LSE (1 << 4) /* Has Large System Extensions. */
--#define AARCH64_FL_V8_1 (1 << 5) /* Has ARMv8.1 extensions. */
-+#define AARCH64_FL_V8_1 (1 << 5) /* Has ARMv8.1-A extensions. */
-+/* ARMv8.2-A architecture extensions. */
-+#define AARCH64_FL_V8_2 (1 << 8) /* Has ARMv8.2-A features. */
-+#define AARCH64_FL_F16 (1 << 9) /* Has ARMv8.2-A FP16 extensions. */
-+/* ARMv8.3-A architecture extensions. */
-+#define AARCH64_FL_V8_3 (1 << 10) /* Has ARMv8.3-A features. */
-
- /* Has FP and SIMD. */
- #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
-@@ -146,6 +151,10 @@ extern unsigned aarch64_architecture_version;
- #define AARCH64_FL_FOR_ARCH8 (AARCH64_FL_FPSIMD)
- #define AARCH64_FL_FOR_ARCH8_1 \
- (AARCH64_FL_FOR_ARCH8 | AARCH64_FL_LSE | AARCH64_FL_CRC | AARCH64_FL_V8_1)
-+#define AARCH64_FL_FOR_ARCH8_2 \
-+ (AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_V8_2)
-+#define AARCH64_FL_FOR_ARCH8_3 \
-+ (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
-
- /* Macros to test ISA flags. */
-
-@@ -155,6 +164,9 @@ extern unsigned aarch64_architecture_version;
- #define AARCH64_ISA_SIMD (aarch64_isa_flags & AARCH64_FL_SIMD)
- #define AARCH64_ISA_LSE (aarch64_isa_flags & AARCH64_FL_LSE)
- #define AARCH64_ISA_RDMA (aarch64_isa_flags & AARCH64_FL_V8_1)
-+#define AARCH64_ISA_V8_2 (aarch64_isa_flags & AARCH64_FL_V8_2)
-+#define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16)
-+#define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3)
-
- /* Crypto is an optional extension to AdvSIMD. */
- #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
-@@ -165,6 +177,13 @@ extern unsigned aarch64_architecture_version;
- /* Atomic instructions that can be enabled through the +lse extension. */
- #define TARGET_LSE (AARCH64_ISA_LSE)
-
-+/* ARMv8.2-A FP16 support that can be enabled through the +fp16 extension. */
-+#define TARGET_FP_F16INST (TARGET_FLOAT && AARCH64_ISA_F16)
-+#define TARGET_SIMD_F16INST (TARGET_SIMD && AARCH64_ISA_F16)
-+
-+/* ARMv8.3-A features. */
-+#define TARGET_ARMV8_3 (AARCH64_ISA_V8_3)
-+
- /* Make sure this is always defined so we don't have to check for ifdefs
- but rather use normal ifs. */
- #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
-@@ -193,7 +212,7 @@ extern unsigned aarch64_architecture_version;
- ((aarch64_fix_a53_err843419 == 2) \
- ? TARGET_FIX_ERR_A53_843419_DEFAULT : aarch64_fix_a53_err843419)
-
--/* ARMv8.1 Adv.SIMD support. */
-+/* ARMv8.1-A Adv.SIMD support. */
- #define TARGET_SIMD_RDMA (TARGET_SIMD && AARCH64_ISA_RDMA)
-
- /* Standard register usage. */
-@@ -539,11 +558,14 @@ struct GTY (()) aarch64_frame
- STACK_BOUNDARY. */
- HOST_WIDE_INT saved_varargs_size;
-
-+ /* The size of the saved callee-save int/FP registers. */
-+
- HOST_WIDE_INT saved_regs_size;
--  /* Padding if needed after all the callee-save registers have
-- been saved. */
-- HOST_WIDE_INT padding0;
-- HOST_WIDE_INT hardfp_offset; /* HARD_FRAME_POINTER_REGNUM */
-+
-+  /* Offset from the base of the frame (incoming SP) to the
-+ top of the locals area. This value is always a multiple of
-+ STACK_BOUNDARY. */
-+ HOST_WIDE_INT locals_offset;
-
-   /* Offset from the base of the frame (incoming SP) to the
- hard_frame_pointer. This value is always a multiple of
-@@ -553,12 +575,25 @@ struct GTY (()) aarch64_frame
-   /* The size of the frame.  This value is the offset from the base of the
-    * frame (incoming SP) to the stack_pointer.  This value is always
- * a multiple of STACK_BOUNDARY. */
-+ HOST_WIDE_INT frame_size;
-+
-+ /* The size of the initial stack adjustment before saving callee-saves. */
-+ HOST_WIDE_INT initial_adjust;
-+
-+ /* The writeback value when pushing callee-save registers.
-+ It is zero when no push is used. */
-+ HOST_WIDE_INT callee_adjust;
-+
-+ /* The offset from SP to the callee-save registers after initial_adjust.
-+     It may be non-zero if no push is used (i.e. callee_adjust == 0).  */
-+ HOST_WIDE_INT callee_offset;
-+
-+ /* The size of the stack adjustment after saving callee-saves. */
-+ HOST_WIDE_INT final_adjust;
-
- unsigned wb_candidate1;
- unsigned wb_candidate2;
-
-- HOST_WIDE_INT frame_size;
--
- bool laid_out;
- };
-
-@@ -652,21 +687,6 @@ typedef struct
-
- #define CONSTANT_ADDRESS_P(X) aarch64_constant_address_p(X)
-
--/* Try a machine-dependent way of reloading an illegitimate address
-- operand. If we find one, push the reload and jump to WIN. This
-- macro is used in only one place: `find_reloads_address' in reload.c. */
--
--#define LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, IND_L, WIN) \
--do { \
-- rtx new_x = aarch64_legitimize_reload_address (&(X), MODE, OPNUM, TYPE, \
-- IND_L); \
-- if (new_x) \
-- { \
-- X = new_x; \
-- goto WIN; \
-- } \
--} while (0)
--
- #define REGNO_OK_FOR_BASE_P(REGNO) \
- aarch64_regno_ok_for_base_p (REGNO, true)
-
-@@ -722,7 +742,12 @@ do { \
- #define USE_STORE_PRE_INCREMENT(MODE) 0
- #define USE_STORE_PRE_DECREMENT(MODE) 0
-
--/* ?? #define WORD_REGISTER_OPERATIONS */
-+/* WORD_REGISTER_OPERATIONS does not hold for AArch64.
-+   The assigned word_mode is DImode, but operations narrower than SImode
-+   behave as 32-bit operations when they use the W-form of the registers,
-+   rather than as word_mode (64-bit) operations as WORD_REGISTER_OPERATIONS
-+ expects. */
-+#define WORD_REGISTER_OPERATIONS 0
-
- /* Define if loading from memory in MODE, an integral mode narrower than
- BITS_PER_WORD will either zero-extend or sign-extend. The value of this
-@@ -842,10 +867,7 @@ do { \
- extern void __aarch64_sync_cache_range (void *, void *); \
- __aarch64_sync_cache_range (beg, end)
-
--#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \
-- aarch64_cannot_change_mode_class (FROM, TO, CLASS)
--
--#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD
-+#define SHIFT_COUNT_TRUNCATED (!TARGET_SIMD)
-
- /* Choose appropriate mode for caller saves, so we do the minimum
- required size of load/store. */
---- a/src/gcc/config/aarch64/aarch64.md
-+++ b/src/gcc/config/aarch64/aarch64.md
-@@ -75,6 +75,8 @@
- UNSPEC_CRC32H
- UNSPEC_CRC32W
- UNSPEC_CRC32X
-+ UNSPEC_FCVTZS
-+ UNSPEC_FCVTZU
- UNSPEC_URECPE
- UNSPEC_FRECPE
- UNSPEC_FRECPS
-@@ -105,6 +107,7 @@
- UNSPEC_NOP
- UNSPEC_PRLG_STK
- UNSPEC_RBIT
-+ UNSPEC_SCVTF
- UNSPEC_SISD_NEG
- UNSPEC_SISD_SSHL
- UNSPEC_SISD_USHL
-@@ -122,6 +125,7 @@
- UNSPEC_TLSLE24
- UNSPEC_TLSLE32
- UNSPEC_TLSLE48
-+ UNSPEC_UCVTF
- UNSPEC_USHL_2S
- UNSPEC_VSTRUCTDUMMY
- UNSPEC_SP_SET
-@@ -837,13 +841,6 @@
- || aarch64_is_noplt_call_p (callee)))
- XEXP (operands[0], 0) = force_reg (Pmode, callee);
-
-- /* FIXME: This is a band-aid. Need to analyze why expand_expr_addr_expr
-- is generating an SImode symbol reference. See PR 64971. */
-- if (TARGET_ILP32
-- && GET_CODE (XEXP (operands[0], 0)) == SYMBOL_REF
-- && GET_MODE (XEXP (operands[0], 0)) == SImode)
-- XEXP (operands[0], 0) = convert_memory_address (Pmode,
-- XEXP (operands[0], 0));
- if (operands[2] == NULL_RTX)
- operands[2] = const0_rtx;
-
-@@ -875,14 +872,6 @@
- || aarch64_is_noplt_call_p (callee)))
- XEXP (operands[1], 0) = force_reg (Pmode, callee);
-
-- /* FIXME: This is a band-aid. Need to analyze why expand_expr_addr_expr
-- is generating an SImode symbol reference. See PR 64971. */
-- if (TARGET_ILP32
-- && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
-- && GET_MODE (XEXP (operands[1], 0)) == SImode)
-- XEXP (operands[1], 0) = convert_memory_address (Pmode,
-- XEXP (operands[1], 0));
--
- if (operands[3] == NULL_RTX)
- operands[3] = const0_rtx;
-
-@@ -1003,6 +992,11 @@
- (match_operand:GPI 1 "general_operand" ""))]
- ""
- "
-+ if (MEM_P (operands[0]) && CONST_INT_P (operands[1])
-+ && <MODE>mode == DImode
-+ && aarch64_split_dimode_const_store (operands[0], operands[1]))
-+ DONE;
-+
- if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx)
- operands[1] = force_reg (<MODE>mode, operands[1]);
-
-@@ -1160,11 +1154,12 @@
- )
-
- (define_insn "*movhf_aarch64"
-- [(set (match_operand:HF 0 "nonimmediate_operand" "=w, ?r,w,w,m,r,m ,r")
-- (match_operand:HF 1 "general_operand" "?rY, w,w,m,w,m,rY,r"))]
-+ [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w ,?r,w,w,m,r,m ,r")
-+ (match_operand:HF 1 "general_operand" "Y ,?rY, w,w,m,w,m,rY,r"))]
- "TARGET_FLOAT && (register_operand (operands[0], HFmode)
- || aarch64_reg_or_fp_zero (operands[1], HFmode))"
- "@
-+ movi\\t%0.4h, #0
- mov\\t%0.h[0], %w1
- umov\\t%w0, %1.h[0]
- mov\\t%0.h[0], %1.h[0]
-@@ -1173,18 +1168,18 @@
- ldrh\\t%w0, %1
- strh\\t%w1, %0
- mov\\t%w0, %w1"
-- [(set_attr "type" "neon_from_gp,neon_to_gp,neon_move,\
-+ [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
- f_loads,f_stores,load1,store1,mov_reg")
-- (set_attr "simd" "yes,yes,yes,*,*,*,*,*")
-- (set_attr "fp" "*,*,*,yes,yes,*,*,*")]
-+ (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
- )
-
- (define_insn "*movsf_aarch64"
-- [(set (match_operand:SF 0 "nonimmediate_operand" "=w, ?r,w,w ,w,m,r,m ,r")
-- (match_operand:SF 1 "general_operand" "?rY, w,w,Ufc,m,w,m,rY,r"))]
-+ [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w,m,r,m ,r")
-+ (match_operand:SF 1 "general_operand" "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
- "TARGET_FLOAT && (register_operand (operands[0], SFmode)
- || aarch64_reg_or_fp_zero (operands[1], SFmode))"
- "@
-+ movi\\t%0.2s, #0
- fmov\\t%s0, %w1
- fmov\\t%w0, %s1
- fmov\\t%s0, %s1
-@@ -1194,16 +1189,18 @@
- ldr\\t%w0, %1
- str\\t%w1, %0
- mov\\t%w0, %w1"
-- [(set_attr "type" "f_mcr,f_mrc,fmov,fconsts,\
-- f_loads,f_stores,load1,store1,mov_reg")]
-+ [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
-+ f_loads,f_stores,load1,store1,mov_reg")
-+ (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
- )
-
- (define_insn "*movdf_aarch64"
-- [(set (match_operand:DF 0 "nonimmediate_operand" "=w, ?r,w,w ,w,m,r,m ,r")
-- (match_operand:DF 1 "general_operand" "?rY, w,w,Ufc,m,w,m,rY,r"))]
-+ [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w,m,r,m ,r")
-+ (match_operand:DF 1 "general_operand" "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
- "TARGET_FLOAT && (register_operand (operands[0], DFmode)
- || aarch64_reg_or_fp_zero (operands[1], DFmode))"
- "@
-+ movi\\t%d0, #0
- fmov\\t%d0, %x1
- fmov\\t%x0, %d1
- fmov\\t%d0, %d1
-@@ -1213,8 +1210,9 @@
- ldr\\t%x0, %1
- str\\t%x1, %0
- mov\\t%x0, %x1"
-- [(set_attr "type" "f_mcr,f_mrc,fmov,fconstd,\
-- f_loadd,f_stored,load1,store1,mov_reg")]
-+ [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\
-+ f_loadd,f_stored,load1,store1,mov_reg")
-+ (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
- )
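
A small illustration, not from the patch itself: the new leading "Y" alternatives let floating-point zero be materialized directly in a SIMD register with MOVI. Under this change one would expect, for example:

  /* Hypothetical example: returning 0.0 can be emitted as movi d0, #0
     instead of an int-to-FP fmov from xzr.  */
  double
  fp_zero (void)
  {
    return 0.0;
  }
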
-
- (define_insn "*movtf_aarch64"
-@@ -1239,7 +1237,6 @@
- [(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
- f_loadd,f_stored,load2,store2,store2")
- (set_attr "length" "4,8,8,8,4,4,4,4,4,4,4")
-- (set_attr "fp" "*,*,yes,yes,*,yes,yes,yes,*,*,*")
- (set_attr "simd" "yes,*,*,*,yes,*,*,*,*,*,*")]
- )
-
-@@ -1552,10 +1549,10 @@
- (zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m")))]
- ""
- "@
-- uxt<SHORT:size>\t%<GPI:w>0, %w1
-+ and\t%<GPI:w>0, %<GPI:w>1, <SHORT:short_mask>
- ldr<SHORT:size>\t%w0, %1
- ldr\t%<SHORT:size>0, %1"
-- [(set_attr "type" "extend,load1,load1")]
-+ [(set_attr "type" "logic_imm,load1,load1")]
- )
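
An illustrative sketch, not part of the patch: the register alternative of the zero-extend pattern now emits AND with an immediate mask instead of UXTB/UXTH, which the logic_imm typing reflects. For instance:

  /* Hypothetical example: expected to assemble to and w0, w0, 255
     rather than uxtb w0, w0.  */
  unsigned int
  zext_byte (unsigned char c)
  {
    return c;
  }
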
-
- (define_expand "<optab>qihi2"
-@@ -1564,16 +1561,26 @@
- ""
- )
-
--(define_insn "*<optab>qihi2_aarch64"
-+(define_insn "*extendqihi2_aarch64"
- [(set (match_operand:HI 0 "register_operand" "=r,r")
-- (ANY_EXTEND:HI (match_operand:QI 1 "nonimmediate_operand" "r,m")))]
-+ (sign_extend:HI (match_operand:QI 1 "nonimmediate_operand" "r,m")))]
- ""
- "@
-- <su>xtb\t%w0, %w1
-- <ldrxt>b\t%w0, %1"
-+ sxtb\t%w0, %w1
-+ ldrsb\t%w0, %1"
- [(set_attr "type" "extend,load1")]
- )
-
-+(define_insn "*zero_extendqihi2_aarch64"
-+ [(set (match_operand:HI 0 "register_operand" "=r,r")
-+ (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "r,m")))]
-+ ""
-+ "@
-+ and\t%w0, %w1, 255
-+ ldrb\t%w0, %1"
-+ [(set_attr "type" "logic_imm,load1")]
-+)
-+
- ;; -------------------------------------------------------------------
- ;; Simple arithmetic
- ;; -------------------------------------------------------------------
-@@ -1585,25 +1592,16 @@
- (match_operand:GPI 2 "aarch64_pluslong_operand" "")))]
- ""
- {
-- if (aarch64_pluslong_strict_immedate (operands[2], <MODE>mode))
-- {
-- /* Give CSE the opportunity to share this constant across additions. */
-- if (!cse_not_expected && can_create_pseudo_p ())
-- operands[2] = force_reg (<MODE>mode, operands[2]);
--
-- /* Split will refuse to operate on a modification to the stack pointer.
-- Aid the prologue and epilogue expanders by splitting this now. */
-- else if (reload_completed && operands[0] == stack_pointer_rtx)
-- {
-- HOST_WIDE_INT i = INTVAL (operands[2]);
-- HOST_WIDE_INT s = (i >= 0 ? i & 0xfff : -(-i & 0xfff));
-- emit_insn (gen_rtx_SET (operands[0],
-- gen_rtx_PLUS (<MODE>mode, operands[1],
-- GEN_INT (i - s))));
-- operands[1] = operands[0];
-- operands[2] = GEN_INT (s);
-- }
-- }
-+  /* If operands[1] is a subreg, extract the inner RTX.  */
-+ rtx op1 = REG_P (operands[1]) ? operands[1] : SUBREG_REG (operands[1]);
-+
-+ /* If the constant is too large for a single instruction and isn't frame
-+ based, split off the immediate so it is available for CSE. */
-+ if (!aarch64_plus_immediate (operands[2], <MODE>mode)
-+ && can_create_pseudo_p ()
-+ && (!REG_P (op1)
-+ || !REGNO_PTR_FRAME_P (REGNO (op1))))
-+ operands[2] = force_reg (<MODE>mode, operands[2]);
- })
-
- (define_insn "*add<mode>3_aarch64"
-@@ -1765,7 +1763,7 @@
- "aarch64_zero_extend_const_eq (<DWI>mode, operands[2],
- <MODE>mode, operands[1])"
- "@
-- cmn\\t%<w>0, %<w>1
-+ cmn\\t%<w>0, %1
- cmp\\t%<w>0, #%n1"
- [(set_attr "type" "alus_imm")]
- )
-@@ -1797,11 +1795,11 @@
- "aarch64_zero_extend_const_eq (<DWI>mode, operands[3],
- <MODE>mode, operands[2])"
- "@
-- adds\\t%<w>0, %<w>1, %<w>2
-+ adds\\t%<w>0, %<w>1, %2
- subs\\t%<w>0, %<w>1, #%n2"
- [(set_attr "type" "alus_imm")]
- )
--
-+
- (define_insn "add<mode>3_compareC"
- [(set (reg:CC_C CC_REGNUM)
- (ne:CC_C
-@@ -3404,7 +3402,9 @@
- (LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r")
- (match_operand:SI 2 "aarch64_logical_operand" "r,K"))))]
- ""
-- "<logical>\\t%w0, %w1, %w2"
-+ "@
-+ <logical>\\t%w0, %w1, %w2
-+ <logical>\\t%w0, %w1, %2"
- [(set_attr "type" "logic_reg,logic_imm")]
- )
-
-@@ -3417,7 +3417,9 @@
- (set (match_operand:GPI 0 "register_operand" "=r,r")
- (and:GPI (match_dup 1) (match_dup 2)))]
- ""
-- "ands\\t%<w>0, %<w>1, %<w>2"
-+ "@
-+ ands\\t%<w>0, %<w>1, %<w>2
-+ ands\\t%<w>0, %<w>1, %2"
- [(set_attr "type" "logics_reg,logics_imm")]
- )
-
-@@ -3431,7 +3433,9 @@
- (set (match_operand:DI 0 "register_operand" "=r,r")
- (zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))]
- ""
-- "ands\\t%w0, %w1, %w2"
-+ "@
-+ ands\\t%w0, %w1, %w2
-+ ands\\t%w0, %w1, %2"
- [(set_attr "type" "logics_reg,logics_imm")]
- )
-
-@@ -3741,6 +3745,39 @@
- }
- )
-
-+;; Pop count can be done via the "CNT" instruction in AdvSIMD.
-+;;
-+;; MOV v.1d, x0
-+;; CNT v1.8b, v.8b
-+;; ADDV b2, v1.8b
-+;; MOV w0, v2.b[0]
-+
-+(define_expand "popcount<mode>2"
-+ [(match_operand:GPI 0 "register_operand")
-+ (match_operand:GPI 1 "register_operand")]
-+ "TARGET_SIMD"
-+{
-+ rtx v = gen_reg_rtx (V8QImode);
-+ rtx v1 = gen_reg_rtx (V8QImode);
-+ rtx r = gen_reg_rtx (QImode);
-+ rtx in = operands[1];
-+ rtx out = operands[0];
-+  if (<MODE>mode == SImode)
-+    {
-+      rtx tmp;
-+      tmp = gen_reg_rtx (DImode);
-+      /* If we have SImode, zero-extend to DImode; the pop count does
-+         not change if we add extra zeros.  */
-+ emit_insn (gen_zero_extendsidi2 (tmp, in));
-+ in = tmp;
-+ }
-+ emit_move_insn (v, gen_lowpart (V8QImode, in));
-+ emit_insn (gen_popcountv8qi2 (v1, v));
-+ emit_insn (gen_reduc_plus_scal_v8qi (r, v1));
-+ emit_insn (gen_zero_extendqi<mode>2 (out, r));
-+ DONE;
-+})
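
A usage sketch, assuming a compiler with this expander: the popcount builtin should now be open-coded with the AdvSIMD sequence shown in the comment above instead of calling libgcc:

  /* Hypothetical example: expands via the MOV/CNT/ADDV/MOV sequence
     when TARGET_SIMD is enabled.  */
  int
  count_set_bits (unsigned long long x)
  {
    return __builtin_popcountll (x);
  }
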
-+
- (define_insn "clrsb<mode>2"
- [(set (match_operand:GPI 0 "register_operand" "=r")
- (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
-@@ -3757,16 +3794,23 @@
- [(set_attr "type" "rbit")]
- )
-
--(define_expand "ctz<mode>2"
-- [(match_operand:GPI 0 "register_operand")
-- (match_operand:GPI 1 "register_operand")]
-+;; Split after reload into RBIT + CLZ. Since RBIT is represented as an UNSPEC
-+;; it is unlikely to fold with any other operation, so keep this as a CTZ
-+;; expression and split after reload to enable scheduling them apart if
-+;; needed.
-+
-+(define_insn_and_split "ctz<mode>2"
-+ [(set (match_operand:GPI 0 "register_operand" "=r")
-+ (ctz:GPI (match_operand:GPI 1 "register_operand" "r")))]
- ""
-- {
-- emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
-- emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
-- DONE;
-- }
--)
-+ "#"
-+ "reload_completed"
-+ [(const_int 0)]
-+ "
-+ emit_insn (gen_rbit<mode>2 (operands[0], operands[1]));
-+ emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
-+ DONE;
-+")
-
- (define_insn "*and<mode>_compare0"
- [(set (reg:CC_NZ CC_REGNUM)
-@@ -3778,6 +3822,18 @@
- [(set_attr "type" "alus_imm")]
- )
-
-+(define_insn "*ands<mode>_compare0"
-+ [(set (reg:CC_NZ CC_REGNUM)
-+ (compare:CC_NZ
-+ (zero_extend:GPI (match_operand:SHORT 1 "register_operand" "r"))
-+ (const_int 0)))
-+ (set (match_operand:GPI 0 "register_operand" "=r")
-+ (zero_extend:GPI (match_dup 1)))]
-+ ""
-+ "ands\\t%<GPI:w>0, %<GPI:w>1, <short_mask>"
-+ [(set_attr "type" "alus_imm")]
-+)
-+
- (define_insn "*and<mode>3nr_compare0"
- [(set (reg:CC_NZ CC_REGNUM)
- (compare:CC_NZ
-@@ -3785,7 +3841,9 @@
- (match_operand:GPI 1 "aarch64_logical_operand" "r,<lconst>"))
- (const_int 0)))]
- ""
-- "tst\\t%<w>0, %<w>1"
-+ "@
-+ tst\\t%<w>0, %<w>1
-+ tst\\t%<w>0, %1"
- [(set_attr "type" "logics_reg,logics_imm")]
- )
-
-@@ -3851,22 +3909,16 @@
- (define_expand "ashl<mode>3"
- [(set (match_operand:SHORT 0 "register_operand")
- (ashift:SHORT (match_operand:SHORT 1 "register_operand")
-- (match_operand:QI 2 "nonmemory_operand")))]
-+ (match_operand:QI 2 "const_int_operand")))]
- ""
- {
-- if (CONST_INT_P (operands[2]))
-- {
-- operands[2] = GEN_INT (INTVAL (operands[2])
-- & (GET_MODE_BITSIZE (<MODE>mode) - 1));
-+ operands[2] = GEN_INT (INTVAL (operands[2]) & GET_MODE_MASK (<MODE>mode));
-
-- if (operands[2] == const0_rtx)
-- {
-- emit_insn (gen_mov<mode> (operands[0], operands[1]));
-- DONE;
-- }
-+ if (operands[2] == const0_rtx)
-+ {
-+ emit_insn (gen_mov<mode> (operands[0], operands[1]));
-+ DONE;
- }
-- else
-- FAIL;
- }
- )
-
-@@ -3915,33 +3967,35 @@
-
- ;; Logical left shift using SISD or Integer instruction
- (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
-- [(set (match_operand:GPI 0 "register_operand" "=r,w,w")
-- (ashift:GPI
-- (match_operand:GPI 1 "register_operand" "r,w,w")
-- (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w")))]
-+ [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
-+ (ashift:GPI
-+ (match_operand:GPI 1 "register_operand" "r,r,w,w")
-+ (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r,Us<cmode>,w")))]
- ""
- "@
-+ lsl\t%<w>0, %<w>1, %2
- lsl\t%<w>0, %<w>1, %<w>2
- shl\t%<rtn>0<vas>, %<rtn>1<vas>, %2
- ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>"
-- [(set_attr "simd" "no,yes,yes")
-- (set_attr "type" "shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
-+ [(set_attr "simd" "no,no,yes,yes")
-+ (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
- )
-
- ;; Logical right shift using SISD or Integer instruction
- (define_insn "*aarch64_lshr_sisd_or_int_<mode>3"
-- [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
-- (lshiftrt:GPI
-- (match_operand:GPI 1 "register_operand" "r,w,w,w")
-- (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w,0")))]
-+ [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
-+ (lshiftrt:GPI
-+ (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
-+ (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r,Us<cmode>,w,0")))]
- ""
- "@
-+ lsr\t%<w>0, %<w>1, %2
- lsr\t%<w>0, %<w>1, %<w>2
- ushr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
- #
- #"
-- [(set_attr "simd" "no,yes,yes,yes")
-- (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
-+ [(set_attr "simd" "no,no,yes,yes,yes")
-+ (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
- )
-
- (define_split
-@@ -3976,18 +4030,19 @@
-
- ;; Arithmetic right shift using SISD or Integer instruction
- (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-- [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
-+ [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
- (ashiftrt:GPI
-- (match_operand:GPI 1 "register_operand" "r,w,w,w")
-- (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "rUs<cmode>,Us<cmode>,w,0")))]
-+ (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
-+ (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "Us<cmode>,r,Us<cmode>,w,0")))]
- ""
- "@
-+ asr\t%<w>0, %<w>1, %2
- asr\t%<w>0, %<w>1, %<w>2
- sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
- #
- #"
-- [(set_attr "simd" "no,yes,yes,yes")
-- (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
-+ [(set_attr "simd" "no,no,yes,yes,yes")
-+ (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
- )
-
- (define_split
-@@ -4079,21 +4134,25 @@
- [(set (match_operand:GPI 0 "register_operand" "=r,r")
- (rotatert:GPI
- (match_operand:GPI 1 "register_operand" "r,r")
-- (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "r,Us<cmode>")))]
-+ (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r")))]
- ""
-- "ror\\t%<w>0, %<w>1, %<w>2"
-- [(set_attr "type" "shift_reg, rotate_imm")]
-+ "@
-+ ror\\t%<w>0, %<w>1, %2
-+ ror\\t%<w>0, %<w>1, %<w>2"
-+ [(set_attr "type" "rotate_imm,shift_reg")]
- )
-
- ;; zero_extend version of above
- (define_insn "*<optab>si3_insn_uxtw"
-- [(set (match_operand:DI 0 "register_operand" "=r")
-+ [(set (match_operand:DI 0 "register_operand" "=r,r")
- (zero_extend:DI (SHIFT:SI
-- (match_operand:SI 1 "register_operand" "r")
-- (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "rUss"))))]
-+ (match_operand:SI 1 "register_operand" "r,r")
-+ (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
- ""
-- "<shift>\\t%w0, %w1, %w2"
-- [(set_attr "type" "shift_reg")]
-+ "@
-+ <shift>\\t%w0, %w1, %2
-+ <shift>\\t%w0, %w1, %w2"
-+ [(set_attr "type" "bfx,shift_reg")]
- )
-
- (define_insn "*<optab><mode>3_insn"
-@@ -4105,7 +4164,7 @@
- operands[3] = GEN_INT (<sizen> - UINTVAL (operands[2]));
- return "<bfshift>\t%w0, %w1, %2, %3";
- }
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
- )
-
- (define_insn "*extr<mode>5_insn"
-@@ -4117,7 +4176,7 @@
- "UINTVAL (operands[3]) < GET_MODE_BITSIZE (<MODE>mode) &&
- (UINTVAL (operands[3]) + UINTVAL (operands[4]) == GET_MODE_BITSIZE (<MODE>mode))"
- "extr\\t%<w>0, %<w>1, %<w>2, %4"
-- [(set_attr "type" "shift_imm")]
-+ [(set_attr "type" "rotate_imm")]
- )
-
- ;; There are no canonicalisation rules for ashift and lshiftrt inside an ior
-@@ -4132,7 +4191,7 @@
- && (UINTVAL (operands[3]) + UINTVAL (operands[4])
- == GET_MODE_BITSIZE (<MODE>mode))"
- "extr\\t%<w>0, %<w>1, %<w>2, %4"
-- [(set_attr "type" "shift_imm")]
-+ [(set_attr "type" "rotate_imm")]
- )
-
- ;; zero_extend version of the above
-@@ -4146,7 +4205,7 @@
- "UINTVAL (operands[3]) < 32 &&
- (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
- "extr\\t%w0, %w1, %w2, %4"
-- [(set_attr "type" "shift_imm")]
-+ [(set_attr "type" "rotate_imm")]
- )
-
- (define_insn "*extrsi5_insn_uxtw_alt"
-@@ -4159,7 +4218,7 @@
- "UINTVAL (operands[3]) < 32 &&
- (UINTVAL (operands[3]) + UINTVAL (operands[4]) == 32)"
- "extr\\t%w0, %w1, %w2, %4"
-- [(set_attr "type" "shift_imm")]
-+ [(set_attr "type" "rotate_imm")]
- )
-
- (define_insn "*ror<mode>3_insn"
-@@ -4198,7 +4257,7 @@
- operands[3] = GEN_INT (<SHORT:sizen> - UINTVAL (operands[2]));
- return "<su>bfiz\t%<GPI:w>0, %<GPI:w>1, %2, %3";
- }
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
- )
-
- (define_insn "*zero_extend<GPI:mode>_lshr<SHORT:mode>"
-@@ -4211,7 +4270,7 @@
- operands[3] = GEN_INT (<SHORT:sizen> - UINTVAL (operands[2]));
- return "ubfx\t%<GPI:w>0, %<GPI:w>1, %2, %3";
- }
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
- )
-
- (define_insn "*extend<GPI:mode>_ashr<SHORT:mode>"
-@@ -4224,7 +4283,7 @@
- operands[3] = GEN_INT (<SHORT:sizen> - UINTVAL (operands[2]));
- return "sbfx\\t%<GPI:w>0, %<GPI:w>1, %2, %3";
- }
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
- )
-
- ;; -------------------------------------------------------------------
-@@ -4256,7 +4315,27 @@
- "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]),
- 1, GET_MODE_BITSIZE (<MODE>mode) - 1)"
- "<su>bfx\\t%<w>0, %<w>1, %3, %2"
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
-+)
-+
-+;; When the bit position and width add up to 32 we can use a W-reg LSR
-+;; instruction taking advantage of the implicit zero-extension of the X-reg.
-+(define_split
-+ [(set (match_operand:DI 0 "register_operand")
-+ (zero_extract:DI (match_operand:DI 1 "register_operand")
-+ (match_operand 2
-+ "aarch64_simd_shift_imm_offset_di")
-+ (match_operand 3
-+ "aarch64_simd_shift_imm_di")))]
-+ "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]), 1,
-+ GET_MODE_BITSIZE (DImode) - 1)
-+ && (INTVAL (operands[2]) + INTVAL (operands[3]))
-+ == GET_MODE_BITSIZE (SImode)"
-+ [(set (match_dup 0)
-+ (zero_extend:DI (lshiftrt:SI (match_dup 4) (match_dup 3))))]
-+ {
-+ operands[4] = gen_lowpart (SImode, operands[1]);
-+ }
- )
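
A hypothetical trigger for the split above: when the extracted field's position plus width equals 32, a single W-register LSR suffices thanks to the implicit zero-extension of the X register:

  /* Hypothetical example: bits [16,32) of a 64-bit value;
     16 + 16 == 32, so this should become lsr w0, w0, 16.  */
  unsigned long long
  extract_bits_16_to_32 (unsigned long long x)
  {
    return (x >> 16) & 0xffff;
  }
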
-
- ;; Bitfield Insert (insv)
-@@ -4338,7 +4417,7 @@
- : GEN_INT (<GPI:sizen> - UINTVAL (operands[2]));
- return "<su>bfiz\t%<GPI:w>0, %<GPI:w>1, %2, %3";
- }
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
- )
-
- ;; XXX We should match (any_extend (ashift)) here, like (and (ashift)) below
-@@ -4348,11 +4427,27 @@
- (and:GPI (ashift:GPI (match_operand:GPI 1 "register_operand" "r")
- (match_operand 2 "const_int_operand" "n"))
- (match_operand 3 "const_int_operand" "n")))]
-- "(INTVAL (operands[2]) < (<GPI:sizen>))
-- && exact_log2 ((INTVAL (operands[3]) >> INTVAL (operands[2])) + 1) >= 0
-- && (INTVAL (operands[3]) & ((1 << INTVAL (operands[2])) - 1)) == 0"
-+ "aarch64_mask_and_shift_for_ubfiz_p (<MODE>mode, operands[3], operands[2])"
- "ubfiz\\t%<w>0, %<w>1, %2, %P3"
-- [(set_attr "type" "bfm")]
-+ [(set_attr "type" "bfx")]
-+)
-+
-+;; When the bit position and width of the equivalent extraction add up to 32
-+;; we can use a W-reg LSL instruction taking advantage of the implicit
-+;; zero-extension of the X-reg.
-+(define_split
-+ [(set (match_operand:DI 0 "register_operand")
-+ (and:DI (ashift:DI (match_operand:DI 1 "register_operand")
-+ (match_operand 2 "const_int_operand"))
-+ (match_operand 3 "const_int_operand")))]
-+ "aarch64_mask_and_shift_for_ubfiz_p (DImode, operands[3], operands[2])
-+ && (INTVAL (operands[2]) + popcount_hwi (INTVAL (operands[3])))
-+ == GET_MODE_BITSIZE (SImode)"
-+ [(set (match_dup 0)
-+ (zero_extend:DI (ashift:SI (match_dup 4) (match_dup 2))))]
-+ {
-+ operands[4] = gen_lowpart (SImode, operands[1]);
-+ }
- )
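
Similarly, a hypothetical trigger for this split: the shift amount plus the field width equals 32, so a W-register LSL can replace the UBFIZ:

  /* Hypothetical example: an 8-bit field shifted up by 24; 24 + 8 == 32,
     so this should become lsl w0, w0, 24.  */
  unsigned long long
  shift_byte_to_bits_24_to_32 (unsigned long long x)
  {
    return (x & 0xff) << 24;
  }
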
-
- (define_insn "bswap<mode>2"
-@@ -4420,22 +4515,23 @@
- ;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn.
-
- (define_insn "<frint_pattern><mode>2"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")]
- FRINT))]
- "TARGET_FLOAT"
- "frint<frint_suffix>\\t%<s>0, %<s>1"
-- [(set_attr "type" "f_rint<s>")]
-+ [(set_attr "type" "f_rint<stype>")]
- )
-
- ;; frcvt floating-point round to integer and convert standard patterns.
- ;; Expands to lbtrunc, lceil, lfloor, lround.
--(define_insn "l<fcvt_pattern><su_optab><GPF:mode><GPI:mode>2"
-+(define_insn "l<fcvt_pattern><su_optab><GPF_F16:mode><GPI:mode>2"
- [(set (match_operand:GPI 0 "register_operand" "=r")
-- (FIXUORS:GPI (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
-- FCVT)))]
-+ (FIXUORS:GPI
-+ (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")]
-+ FCVT)))]
- "TARGET_FLOAT"
-- "fcvt<frint_suffix><su>\\t%<GPI:w>0, %<GPF:s>1"
-+ "fcvt<frint_suffix><su>\\t%<GPI:w>0, %<GPF_F16:s>1"
- [(set_attr "type" "f_cvtf2i")]
- )
-
-@@ -4461,23 +4557,24 @@
- ;; fma - no throw
-
- (define_insn "fma<mode>4"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (fma:GPF (match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w")
-- (match_operand:GPF 3 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (fma:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")
-+ (match_operand:GPF_F16 2 "register_operand" "w")
-+ (match_operand:GPF_F16 3 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fmadd\\t%<s>0, %<s>1, %<s>2, %<s>3"
-- [(set_attr "type" "fmac<s>")]
-+ [(set_attr "type" "fmac<stype>")]
- )
-
- (define_insn "fnma<mode>4"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (fma:GPF (neg:GPF (match_operand:GPF 1 "register_operand" "w"))
-- (match_operand:GPF 2 "register_operand" "w")
-- (match_operand:GPF 3 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (fma:GPF_F16
-+ (neg:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w"))
-+ (match_operand:GPF_F16 2 "register_operand" "w")
-+ (match_operand:GPF_F16 3 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fmsub\\t%<s>0, %<s>1, %<s>2, %<s>3"
-- [(set_attr "type" "fmac<s>")]
-+ [(set_attr "type" "fmac<stype>")]
- )
-
- (define_insn "fms<mode>4"
-@@ -4563,19 +4660,11 @@
- [(set_attr "type" "f_cvt")]
- )
-
--(define_insn "fix_trunc<GPF:mode><GPI:mode>2"
-- [(set (match_operand:GPI 0 "register_operand" "=r")
-- (fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
-- "TARGET_FLOAT"
-- "fcvtzs\\t%<GPI:w>0, %<GPF:s>1"
-- [(set_attr "type" "f_cvtf2i")]
--)
--
--(define_insn "fixuns_trunc<GPF:mode><GPI:mode>2"
-+(define_insn "<optab>_trunc<GPF_F16:mode><GPI:mode>2"
- [(set (match_operand:GPI 0 "register_operand" "=r")
-- (unsigned_fix:GPI (match_operand:GPF 1 "register_operand" "w")))]
-+ (FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))]
- "TARGET_FLOAT"
-- "fcvtzu\\t%<GPI:w>0, %<GPF:s>1"
-+ "fcvtz<su>\t%<GPI:w>0, %<GPF_F16:s>1"
- [(set_attr "type" "f_cvtf2i")]
- )
-
-@@ -4599,38 +4688,116 @@
- [(set_attr "type" "f_cvti2f")]
- )
-
-+(define_insn "<optab><mode>hf2"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (FLOATUORS:HF (match_operand:GPI 1 "register_operand" "r")))]
-+ "TARGET_FP_F16INST"
-+ "<su_optab>cvtf\t%h0, %<w>1"
-+ [(set_attr "type" "f_cvti2f")]
-+)
-+
-+;; Convert between fixed-point and floating-point (scalar modes)
-+
-+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3"
-+ [(set (match_operand:<GPF:FCVT_TARGET> 0 "register_operand" "=r, w")
-+ (unspec:<GPF:FCVT_TARGET> [(match_operand:GPF 1 "register_operand" "w, w")
-+ (match_operand:SI 2 "immediate_operand" "i, i")]
-+ FCVT_F2FIXED))]
-+ ""
-+ "@
-+ <FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPF:w1>0, %<GPF:s>1, #%2
-+ <FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPF:s>0, %<GPF:s>1, #%2"
-+ [(set_attr "type" "f_cvtf2i, neon_fp_to_int_<GPF:Vetype>")
-+ (set_attr "fp" "yes, *")
-+ (set_attr "simd" "*, yes")]
-+)
-+
-+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3"
-+ [(set (match_operand:<GPI:FCVT_TARGET> 0 "register_operand" "=w, w")
-+ (unspec:<GPI:FCVT_TARGET> [(match_operand:GPI 1 "register_operand" "r, w")
-+ (match_operand:SI 2 "immediate_operand" "i, i")]
-+ FCVT_FIXED2F))]
-+ ""
-+ "@
-+ <FCVT_FIXED2F:fcvt_fixed_insn>\t%<GPI:v>0, %<GPI:w>1, #%2
-+ <FCVT_FIXED2F:fcvt_fixed_insn>\t%<GPI:v>0, %<GPI:v>1, #%2"
-+ [(set_attr "type" "f_cvti2f, neon_int_to_fp_<GPI:Vetype>")
-+ (set_attr "fp" "yes, *")
-+ (set_attr "simd" "*, yes")]
-+)
-+
-+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn>hf<mode>3"
-+ [(set (match_operand:GPI 0 "register_operand" "=r")
-+ (unspec:GPI [(match_operand:HF 1 "register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ FCVT_F2FIXED))]
-+ "TARGET_FP_F16INST"
-+ "<FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPI:w>0, %h1, #%2"
-+ [(set_attr "type" "f_cvtf2i")]
-+)
-+
-+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><mode>hf3"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (unspec:HF [(match_operand:GPI 1 "register_operand" "r")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ FCVT_FIXED2F))]
-+ "TARGET_FP_F16INST"
-+ "<FCVT_FIXED2F:fcvt_fixed_insn>\t%h0, %<GPI:w>1, #%2"
-+ [(set_attr "type" "f_cvti2f")]
-+)
-+
-+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn>hf3"
-+ [(set (match_operand:HI 0 "register_operand" "=w")
-+ (unspec:HI [(match_operand:HF 1 "register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ FCVT_F2FIXED))]
-+ "TARGET_SIMD"
-+ "<FCVT_F2FIXED:fcvt_fixed_insn>\t%h0, %h1, #%2"
-+ [(set_attr "type" "neon_fp_to_int_s")]
-+)
-+
-+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn>hi3"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (unspec:HF [(match_operand:HI 1 "register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ FCVT_FIXED2F))]
-+ "TARGET_SIMD"
-+ "<FCVT_FIXED2F:fcvt_fixed_insn>\t%h0, %h1, #%2"
-+ [(set_attr "type" "neon_int_to_fp_s")]
-+)
-+
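
A sketch of what these fixed-point conversion patterns enable, stated as an expectation rather than a guarantee: a float-to-integer conversion fused with a power-of-two scale can map onto a single fixed-point FCVTZS:

  /* Hypothetical example: the multiply by 2^16 folds into the convert,
     e.g. fcvtzs w0, s0, #16.  */
  int
  float_to_q16 (float f)
  {
    return (int) (f * 65536.0f);
  }
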
- ;; -------------------------------------------------------------------
- ;; Floating-point arithmetic
- ;; -------------------------------------------------------------------
-
- (define_insn "add<mode>3"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (plus:GPF
-- (match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (plus:GPF_F16
-+ (match_operand:GPF_F16 1 "register_operand" "w")
-+ (match_operand:GPF_F16 2 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fadd\\t%<s>0, %<s>1, %<s>2"
-- [(set_attr "type" "fadd<s>")]
-+ [(set_attr "type" "fadd<stype>")]
- )
-
- (define_insn "sub<mode>3"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (minus:GPF
-- (match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (minus:GPF_F16
-+ (match_operand:GPF_F16 1 "register_operand" "w")
-+ (match_operand:GPF_F16 2 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fsub\\t%<s>0, %<s>1, %<s>2"
-- [(set_attr "type" "fadd<s>")]
-+ [(set_attr "type" "fadd<stype>")]
- )
-
- (define_insn "mul<mode>3"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (mult:GPF
-- (match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (mult:GPF_F16
-+ (match_operand:GPF_F16 1 "register_operand" "w")
-+ (match_operand:GPF_F16 2 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fmul\\t%<s>0, %<s>1, %<s>2"
-- [(set_attr "type" "fmul<s>")]
-+ [(set_attr "type" "fmul<stype>")]
- )
-
- (define_insn "*fnmul<mode>3"
-@@ -4653,38 +4820,58 @@
- [(set_attr "type" "fmul<s>")]
- )
-
--(define_insn "div<mode>3"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (div:GPF
-- (match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w")))]
-+(define_expand "div<mode>3"
-+ [(set (match_operand:GPF_F16 0 "register_operand")
-+ (div:GPF_F16 (match_operand:GPF_F16 1 "general_operand")
-+ (match_operand:GPF_F16 2 "register_operand")))]
-+ "TARGET_SIMD"
-+{
-+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
-+ DONE;
-+
-+ operands[1] = force_reg (<MODE>mode, operands[1]);
-+})
-+
-+(define_insn "*div<mode>3"
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (div:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")
-+ (match_operand:GPF_F16 2 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fdiv\\t%<s>0, %<s>1, %<s>2"
-- [(set_attr "type" "fdiv<s>")]
-+ [(set_attr "type" "fdiv<stype>")]
- )
-
- (define_insn "neg<mode>2"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (neg:GPF (match_operand:GPF 1 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (neg:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fneg\\t%<s>0, %<s>1"
-- [(set_attr "type" "ffarith<s>")]
-+ [(set_attr "type" "ffarith<stype>")]
- )
-
--(define_insn "sqrt<mode>2"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
-+(define_expand "sqrt<mode>2"
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))]
-+ "TARGET_FLOAT"
-+{
-+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
-+ DONE;
-+})
-+
-+(define_insn "*sqrt<mode>2"
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fsqrt\\t%<s>0, %<s>1"
-- [(set_attr "type" "fsqrt<s>")]
-+ [(set_attr "type" "fsqrt<stype>")]
- )
-
- (define_insn "abs<mode>2"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (abs:GPF (match_operand:GPF 1 "register_operand" "w")))]
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (abs:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))]
- "TARGET_FLOAT"
- "fabs\\t%<s>0, %<s>1"
-- [(set_attr "type" "ffarith<s>")]
-+ [(set_attr "type" "ffarith<stype>")]
- )
-
- ;; Given that smax/smin do not specify the result when either input is NaN,
-@@ -4709,15 +4896,17 @@
- [(set_attr "type" "f_minmax<s>")]
- )
-
--;; Scalar forms for the IEEE-754 fmax()/fmin() functions
--(define_insn "<fmaxmin><mode>3"
-- [(set (match_operand:GPF 0 "register_operand" "=w")
-- (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")
-- (match_operand:GPF 2 "register_operand" "w")]
-- FMAXMIN))]
-+;; Scalar forms for fmax, fmin, fmaxnm, fminnm.
-+;; fmaxnm and fminnm are used for the fmax<mode>3 standard pattern names,
-+;; which implement the IEEE fmax ()/fmin () functions.
-+(define_insn "<maxmin_uns><mode>3"
-+ [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-+ (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")
-+ (match_operand:GPF_F16 2 "register_operand" "w")]
-+ FMAXMIN_UNS))]
- "TARGET_FLOAT"
-- "<fmaxmin_op>\\t%<s>0, %<s>1, %<s>2"
-- [(set_attr "type" "f_minmax<s>")]
-+ "<maxmin_uns_op>\\t%<s>0, %<s>1, %<s>2"
-+ [(set_attr "type" "f_minmax<stype>")]
- )
-
- ;; For copysign (x, y), we want to generate:
-@@ -4775,7 +4964,7 @@
- [(set (match_operand:GPF_TF 0 "register_operand" "=w")
- (mem:GPF_TF (match_operand 1 "aarch64_constant_pool_symref" "S")))
- (clobber (match_operand:P 2 "register_operand" "=&r"))]
-- "TARGET_FLOAT && aarch64_nopcrelative_literal_loads"
-+ "TARGET_FLOAT"
- {
- aarch64_expand_mov_immediate (operands[2], XEXP (operands[1], 0));
- emit_move_insn (operands[0], gen_rtx_MEM (<GPF_TF:MODE>mode, operands[2]));
-@@ -4788,7 +4977,7 @@
- [(set (match_operand:VALL 0 "register_operand" "=w")
- (mem:VALL (match_operand 1 "aarch64_constant_pool_symref" "S")))
- (clobber (match_operand:P 2 "register_operand" "=&r"))]
-- "TARGET_FLOAT && aarch64_nopcrelative_literal_loads"
-+ "TARGET_FLOAT"
- {
- aarch64_expand_mov_immediate (operands[2], XEXP (operands[1], 0));
- emit_move_insn (operands[0], gen_rtx_MEM (<VALL:MODE>mode, operands[2]));
-@@ -4961,20 +5150,20 @@
- ;; The TLS ABI specifically requires that the compiler does not schedule
- ;; instructions in the TLS stubs, in order to enable linker relaxation.
- ;; Therefore we treat the stubs as an atomic sequence.
--(define_expand "tlsgd_small"
-+(define_expand "tlsgd_small_<mode>"
- [(parallel [(set (match_operand 0 "register_operand" "")
- (call (mem:DI (match_dup 2)) (const_int 1)))
-- (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
-+ (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS)
- (clobber (reg:DI LR_REGNUM))])]
- ""
- {
- operands[2] = aarch64_tls_get_addr ();
- })
-
--(define_insn "*tlsgd_small"
-+(define_insn "*tlsgd_small_<mode>"
- [(set (match_operand 0 "register_operand" "")
- (call (mem:DI (match_operand:DI 2 "" "")) (const_int 1)))
-- (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
-+ (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS)
- (clobber (reg:DI LR_REGNUM))
- ]
- ""
-@@ -5182,7 +5371,7 @@
- UNSPEC_SP_TEST))
- (clobber (match_scratch:PTR 3 "=&r"))]
- ""
-- "ldr\t%<w>3, %x1\;ldr\t%<w>0, %x2\;eor\t%<w>0, %<w>3, %<w>0"
-+ "ldr\t%<w>3, %1\;ldr\t%<w>0, %2\;eor\t%<w>0, %<w>3, %<w>0"
- [(set_attr "length" "12")
- (set_attr "type" "multiple")])
-
---- a/src/gcc/config/aarch64/aarch64.opt
-+++ b/src/gcc/config/aarch64/aarch64.opt
-@@ -146,10 +146,28 @@ EnumValue
- Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
-
- mpc-relative-literal-loads
--Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
-+Target Report Save Var(pcrelative_literal_loads) Init(2) Save
- PC relative literal loads.
-
- mlow-precision-recip-sqrt
- Common Var(flag_mrecip_low_precision_sqrt) Optimization
--When calculating the reciprocal square root approximation,
--uses one less step than otherwise, thus reducing latency and precision.
-+Enable the reciprocal square root approximation. Enabling this reduces
-+precision of reciprocal square root results to about 16 bits for
-+single precision and to 32 bits for double precision.
-+
-+mlow-precision-sqrt
-+Common Var(flag_mlow_precision_sqrt) Optimization
-+Enable the square root approximation. Enabling this reduces
-+precision of square root results to about 16 bits for
-+single precision and to 32 bits for double precision.
-+If enabled, it implies -mlow-precision-recip-sqrt.
-+
-+mlow-precision-div
-+Common Var(flag_mlow_precision_div) Optimization
-+Enable the division approximation. Enabling this reduces
-+precision of division results to about 16 bits for
-+single precision and to 32 bits for double precision.
-+
-+mverbose-cost-dump
-+Common Undocumented Var(flag_aarch64_verbose_cost)
-+Enables verbose cost model dumping in the debug dump files.
---- /dev/null
-+++ b/src/gcc/config/aarch64/arm_fp16.h
-@@ -0,0 +1,579 @@
-+/* ARM FP16 scalar intrinsics include file.
-+
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef _AARCH64_FP16_H_
-+#define _AARCH64_FP16_H_
-+
-+#include <stdint.h>
-+
-+#pragma GCC push_options
-+#pragma GCC target ("arch=armv8.2-a+fp16")
-+
-+typedef __fp16 float16_t;
-+
-+/* ARMv8.2-A FP16 one operand scalar intrinsics. */
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vabsh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_abshf (__a);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vceqzh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_cmeqhf_uss (__a, 0.0f);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcgezh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_cmgehf_uss (__a, 0.0f);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcgtzh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_cmgthf_uss (__a, 0.0f);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vclezh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_cmlehf_uss (__a, 0.0f);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcltzh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_cmlthf_uss (__a, 0.0f);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_s16 (int16_t __a)
-+{
-+ return __builtin_aarch64_floathihf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_s32 (int32_t __a)
-+{
-+ return __builtin_aarch64_floatsihf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_s64 (int64_t __a)
-+{
-+ return __builtin_aarch64_floatdihf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_u16 (uint16_t __a)
-+{
-+ return __builtin_aarch64_floatunshihf_us (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_u32 (uint32_t __a)
-+{
-+ return __builtin_aarch64_floatunssihf_us (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_u64 (uint64_t __a)
-+{
-+ return __builtin_aarch64_floatunsdihf_us (__a);
-+}
-+
-+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+vcvth_s16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_fix_trunchfhi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvth_s32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_fix_trunchfsi (__a);
-+}
-+
-+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+vcvth_s64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_fix_trunchfdi (__a);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcvth_u16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_fixuns_trunchfhi_us (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvth_u32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_fixuns_trunchfsi_us (__a);
-+}
-+
-+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+vcvth_u64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_fixuns_trunchfdi_us (__a);
-+}
-+
-+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+vcvtah_s16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lroundhfhi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtah_s32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lroundhfsi (__a);
-+}
-+
-+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+vcvtah_s64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lroundhfdi (__a);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcvtah_u16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lrounduhfhi_us (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtah_u32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lrounduhfsi_us (__a);
-+}
-+
-+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+vcvtah_u64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lrounduhfdi_us (__a);
-+}
-+
-+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+vcvtmh_s16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfloorhfhi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtmh_s32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfloorhfsi (__a);
-+}
-+
-+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+vcvtmh_s64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfloorhfdi (__a);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcvtmh_u16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lflooruhfhi_us (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtmh_u32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lflooruhfsi_us (__a);
-+}
-+
-+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+vcvtmh_u64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lflooruhfdi_us (__a);
-+}
-+
-+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+vcvtnh_s16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfrintnhfhi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtnh_s32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfrintnhfsi (__a);
-+}
-+
-+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+vcvtnh_s64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfrintnhfdi (__a);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcvtnh_u16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfrintnuhfhi_us (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtnh_u32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfrintnuhfsi_us (__a);
-+}
-+
-+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+vcvtnh_u64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lfrintnuhfdi_us (__a);
-+}
-+
-+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+vcvtph_s16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lceilhfhi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtph_s32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lceilhfsi (__a);
-+}
-+
-+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+vcvtph_s64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lceilhfdi (__a);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcvtph_u16_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lceiluhfhi_us (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtph_u32_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lceiluhfsi_us (__a);
-+}
-+
-+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+vcvtph_u64_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_lceiluhfdi_us (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vnegh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_neghf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrecpeh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_frecpehf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrecpxh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_frecpxhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_btrunchf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndah_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_roundhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndih_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_nearbyinthf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndmh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_floorhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndnh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_frintnhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndph_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_ceilhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndxh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_rinthf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrsqrteh_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_rsqrtehf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vsqrth_f16 (float16_t __a)
-+{
-+ return __builtin_aarch64_sqrthf (__a);
-+}
-+
-+/* ARMv8.2-A FP16 two-operand scalar intrinsics. */
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vaddh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a + __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vabdh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_fabdhf (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcageh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_facgehf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcagth_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_facgthf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcaleh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_faclehf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcalth_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_faclthf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vceqh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_cmeqhf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcgeh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_cmgehf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcgth_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_cmgthf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcleh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_cmlehf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vclth_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_cmlthf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_s16 (int16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_scvtfhi (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_s32 (int32_t __a, const int __b)
-+{
-+ return __builtin_aarch64_scvtfsihf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_s64 (int64_t __a, const int __b)
-+{
-+ return __builtin_aarch64_scvtfdihf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_u16 (uint16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_ucvtfhi_sus (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_u32 (uint32_t __a, const int __b)
-+{
-+ return __builtin_aarch64_ucvtfsihf_sus (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_u64 (uint64_t __a, const int __b)
-+{
-+ return __builtin_aarch64_ucvtfdihf_sus (__a, __b);
-+}
-+
-+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+vcvth_n_s16_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_fcvtzshf (__a, __b);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvth_n_s32_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_fcvtzshfsi (__a, __b);
-+}
-+
-+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+vcvth_n_s64_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_fcvtzshfdi (__a, __b);
-+}
-+
-+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+vcvth_n_u16_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_fcvtzuhf_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvth_n_u32_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_fcvtzuhfsi_uss (__a, __b);
-+}
-+
-+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+vcvth_n_u64_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_aarch64_fcvtzuhfdi_uss (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vdivh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a / __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmaxh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_smax_nanhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmaxnmh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_fmaxhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vminh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_smin_nanhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vminnmh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_fminhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmulh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a * __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmulxh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_fmulxhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrecpsh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_frecpshf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrsqrtsh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_aarch64_rsqrtshf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vsubh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a - __b;
-+}
-+
-+/* ARMv8.2-A FP16 three-operand scalar intrinsics. */
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vfmah_f16 (float16_t __a, float16_t __b, float16_t __c)
-+{
-+ return __builtin_aarch64_fmahf (__b, __c, __a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c)
-+{
-+ return __builtin_aarch64_fnmahf (__b, __c, __a);
-+}
-+
-+#pragma GCC pop_options
-+
-+#endif
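
[Illustration, not part of the diff: exercising a few of the scalar FP16
intrinsics declared in the new arm_fp16.h above. The intrinsic names and
semantics are taken from the header itself; the compile-flag note is an
assumption — callers must be built for a target compatible with the header's
"arch=armv8.2-a+fp16" pragma, e.g. -march=armv8.2-a+fp16, or the
always_inline functions will fail to inline.]

    #include <arm_fp16.h>

    /* Fused a + b*c (vfmah_f16 maps b, c, a onto __builtin_aarch64_fmahf),
       rounded to nearest with ties to even, then converted to int32.  */
    int32_t fma_round_convert (float16_t a, float16_t b, float16_t c)
    {
      float16_t r = vfmah_f16 (a, b, c);
      return vcvth_s32_f16 (vrndnh_f16 (r));
    }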
---- a/src/gcc/config/aarch64/arm_neon.h
-+++ b/src/gcc/config/aarch64/arm_neon.h
-@@ -58,6 +58,7 @@ typedef __Float64x2_t float64x2_t;
- typedef __Poly8x16_t poly8x16_t;
- typedef __Poly16x8_t poly16x8_t;
- typedef __Poly64x2_t poly64x2_t;
-+typedef __Poly64x1_t poly64x1_t;
- typedef __Uint8x16_t uint8x16_t;
- typedef __Uint16x8_t uint16x8_t;
- typedef __Uint32x4_t uint32x4_t;
-@@ -202,6 +203,36 @@ typedef struct poly16x8x2_t
- poly16x8_t val[2];
- } poly16x8x2_t;
-
-+typedef struct poly64x1x2_t
-+{
-+ poly64x1_t val[2];
-+} poly64x1x2_t;
-+
-+typedef struct poly64x1x3_t
-+{
-+ poly64x1_t val[3];
-+} poly64x1x3_t;
-+
-+typedef struct poly64x1x4_t
-+{
-+ poly64x1_t val[4];
-+} poly64x1x4_t;
-+
-+typedef struct poly64x2x2_t
-+{
-+ poly64x2_t val[2];
-+} poly64x2x2_t;
-+
-+typedef struct poly64x2x3_t
-+{
-+ poly64x2_t val[3];
-+} poly64x2x3_t;
-+
-+typedef struct poly64x2x4_t
-+{
-+ poly64x2_t val[4];
-+} poly64x2x4_t;
-+
- typedef struct int8x8x3_t
- {
- int8x8_t val[3];
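
[Illustration, not part of the diff: the poly64x1x2_t .. poly64x2x4_t
container types added in the hunk above follow the existing NEON
multi-vector pattern and back the de-interleaving load/store intrinsics for
poly64 elements. The sketch assumes vld2_p64 is provided elsewhere in this
patch and that poly64 support (+crypto) is enabled.]

    #include <arm_neon.h>

    /* De-interleaving load of two poly64x1_t values from memory.  */
    poly64x1x2_t load_p64_pair (const poly64_t *p)
    {
      return vld2_p64 (p);
    }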
-@@ -466,6 +497,8 @@ typedef struct poly16x8x4_t
- #define __aarch64_vdup_lane_any(__size, __q, __a, __b) \
- vdup##__q##_n_##__size (__aarch64_vget_lane_any (__a, __b))
-
-+#define __aarch64_vdup_lane_f16(__a, __b) \
-+ __aarch64_vdup_lane_any (f16, , __a, __b)
- #define __aarch64_vdup_lane_f32(__a, __b) \
- __aarch64_vdup_lane_any (f32, , __a, __b)
- #define __aarch64_vdup_lane_f64(__a, __b) \
-@@ -474,6 +507,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (p8, , __a, __b)
- #define __aarch64_vdup_lane_p16(__a, __b) \
- __aarch64_vdup_lane_any (p16, , __a, __b)
-+#define __aarch64_vdup_lane_p64(__a, __b) \
-+ __aarch64_vdup_lane_any (p64, , __a, __b)
- #define __aarch64_vdup_lane_s8(__a, __b) \
- __aarch64_vdup_lane_any (s8, , __a, __b)
- #define __aarch64_vdup_lane_s16(__a, __b) \
-@@ -492,6 +527,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (u64, , __a, __b)
-
- /* __aarch64_vdup_laneq internal macros. */
-+#define __aarch64_vdup_laneq_f16(__a, __b) \
-+ __aarch64_vdup_lane_any (f16, , __a, __b)
- #define __aarch64_vdup_laneq_f32(__a, __b) \
- __aarch64_vdup_lane_any (f32, , __a, __b)
- #define __aarch64_vdup_laneq_f64(__a, __b) \
-@@ -500,6 +537,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (p8, , __a, __b)
- #define __aarch64_vdup_laneq_p16(__a, __b) \
- __aarch64_vdup_lane_any (p16, , __a, __b)
-+#define __aarch64_vdup_laneq_p64(__a, __b) \
-+ __aarch64_vdup_lane_any (p64, , __a, __b)
- #define __aarch64_vdup_laneq_s8(__a, __b) \
- __aarch64_vdup_lane_any (s8, , __a, __b)
- #define __aarch64_vdup_laneq_s16(__a, __b) \
-@@ -518,6 +557,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (u64, , __a, __b)
-
- /* __aarch64_vdupq_lane internal macros. */
-+#define __aarch64_vdupq_lane_f16(__a, __b) \
-+ __aarch64_vdup_lane_any (f16, q, __a, __b)
- #define __aarch64_vdupq_lane_f32(__a, __b) \
- __aarch64_vdup_lane_any (f32, q, __a, __b)
- #define __aarch64_vdupq_lane_f64(__a, __b) \
-@@ -526,6 +567,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (p8, q, __a, __b)
- #define __aarch64_vdupq_lane_p16(__a, __b) \
- __aarch64_vdup_lane_any (p16, q, __a, __b)
-+#define __aarch64_vdupq_lane_p64(__a, __b) \
-+ __aarch64_vdup_lane_any (p64, q, __a, __b)
- #define __aarch64_vdupq_lane_s8(__a, __b) \
- __aarch64_vdup_lane_any (s8, q, __a, __b)
- #define __aarch64_vdupq_lane_s16(__a, __b) \
-@@ -544,6 +587,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (u64, q, __a, __b)
-
- /* __aarch64_vdupq_laneq internal macros. */
-+#define __aarch64_vdupq_laneq_f16(__a, __b) \
-+ __aarch64_vdup_lane_any (f16, q, __a, __b)
- #define __aarch64_vdupq_laneq_f32(__a, __b) \
- __aarch64_vdup_lane_any (f32, q, __a, __b)
- #define __aarch64_vdupq_laneq_f64(__a, __b) \
-@@ -552,6 +597,8 @@ typedef struct poly16x8x4_t
- __aarch64_vdup_lane_any (p8, q, __a, __b)
- #define __aarch64_vdupq_laneq_p16(__a, __b) \
- __aarch64_vdup_lane_any (p16, q, __a, __b)
-+#define __aarch64_vdupq_laneq_p64(__a, __b) \
-+ __aarch64_vdup_lane_any (p64, q, __a, __b)
- #define __aarch64_vdupq_laneq_s8(__a, __b) \
- __aarch64_vdup_lane_any (s8, q, __a, __b)
- #define __aarch64_vdupq_laneq_s16(__a, __b) \
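
[Illustration, not part of the diff: the large hunk below mechanically
converts every intrinsic from "static __inline" to "extern __inline" with
__gnu_inline__ and __artificial__. A standalone sketch of the pattern
follows; the rationale is the editor's summary: with __gnu_inline__ the
definition is used for inlining only and never emits an out-of-line symbol,
so the header no longer creates one static copy per translation unit, and
__artificial__ attributes the inlined code to the caller's source location
for debugging.]

    __extension__ extern __inline int
    __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
    add_one (int __x)
    {
      return __x + 1;
    }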
-@@ -601,535 +648,619 @@ typedef struct poly16x8x4_t
- })
-
- /* vadd */
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_f32 (float32x2_t __a, float32x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_f64 (float64x1_t __a, float64x1_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_f64 (float64x2_t __a, float64x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_saddlv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_saddlv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t) __builtin_aarch64_saddlv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_uaddlv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_uaddlv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_uaddlv2si ((int32x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_high_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int16x8_t) __builtin_aarch64_saddl2v16qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_high_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int32x4_t) __builtin_aarch64_saddl2v8hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_high_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int64x2_t) __builtin_aarch64_saddl2v4si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_high_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_uaddl2v16qi ((int8x16_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_high_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_uaddl2v8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_high_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_uaddl2v4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_s8 (int16x8_t __a, int8x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_saddwv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_s16 (int32x4_t __a, int16x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_saddwv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_s32 (int64x2_t __a, int32x2_t __b)
- {
- return (int64x2_t) __builtin_aarch64_saddwv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_uaddwv8qi ((int16x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_u16 (uint32x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_uaddwv4hi ((int32x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_u32 (uint64x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_uaddwv2si ((int64x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_high_s8 (int16x8_t __a, int8x16_t __b)
- {
- return (int16x8_t) __builtin_aarch64_saddw2v16qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_high_s16 (int32x4_t __a, int16x8_t __b)
- {
- return (int32x4_t) __builtin_aarch64_saddw2v8hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_high_s32 (int64x2_t __a, int32x4_t __b)
- {
- return (int64x2_t) __builtin_aarch64_saddw2v4si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_high_u8 (uint16x8_t __a, uint8x16_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_uaddw2v16qi ((int16x8_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_high_u16 (uint32x4_t __a, uint16x8_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_uaddw2v8hi ((int32x4_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_high_u32 (uint64x2_t __a, uint32x4_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_uaddw2v4si ((int64x2_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_shaddv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_shaddv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_shaddv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_uhaddv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_uhaddv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_uhaddv2si ((int32x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t) __builtin_aarch64_shaddv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_shaddv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_shaddv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t) __builtin_aarch64_uhaddv16qi ((int8x16_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_uhaddv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_uhaddv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_srhaddv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_srhaddv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_srhaddv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_urhaddv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_urhaddv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_urhaddv2si ((int32x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t) __builtin_aarch64_srhaddv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_srhaddv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_srhaddv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t) __builtin_aarch64_urhaddv16qi ((int8x16_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_urhaddv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_urhaddv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_addhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_addhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_addhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_addhnv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_addhnv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_addhnv2di ((int64x2_t) __a,
- (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_raddhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_raddhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_raddhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_raddhnv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_raddhnv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_raddhnv2di ((int64x2_t) __a,
- (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int8x16_t) __builtin_aarch64_addhn2v8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int16x8_t) __builtin_aarch64_addhn2v4si (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
- {
- return (int32x4_t) __builtin_aarch64_addhn2v2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint8x16_t) __builtin_aarch64_addhn2v8hi ((int8x8_t) __a,
-@@ -1137,7 +1268,8 @@ vaddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- (int16x8_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint16x8_t) __builtin_aarch64_addhn2v4si ((int16x4_t) __a,
-@@ -1145,7 +1277,8 @@ vaddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- (int32x4_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- {
- return (uint32x4_t) __builtin_aarch64_addhn2v2di ((int32x2_t) __a,
-@@ -1153,25 +1286,29 @@ vaddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- (int64x2_t) __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int8x16_t) __builtin_aarch64_raddhn2v8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int16x8_t) __builtin_aarch64_raddhn2v4si (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
- {
- return (int32x4_t) __builtin_aarch64_raddhn2v2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint8x16_t) __builtin_aarch64_raddhn2v8hi ((int8x8_t) __a,
-@@ -1179,7 +1316,8 @@ vraddhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- (int16x8_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint16x8_t) __builtin_aarch64_raddhn2v4si ((int16x4_t) __a,
-@@ -1187,7 +1325,8 @@ vraddhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- (int32x4_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- {
- return (uint32x4_t) __builtin_aarch64_raddhn2v2di ((int32x2_t) __a,
-@@ -1195,1101 +1334,1280 @@ vraddhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- (int64x2_t) __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdiv_f32 (float32x2_t __a, float32x2_t __b)
- {
- return __a / __b;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdiv_f64 (float64x1_t __a, float64x1_t __b)
- {
- return __a / __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdivq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return __a / __b;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdivq_f64 (float64x2_t __a, float64x2_t __b)
- {
- return __a / __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_f32 (float32x2_t __a, float32x2_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_f64 (float64x1_t __a, float64x1_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (poly8x8_t) __builtin_aarch64_pmulv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_f64 (float64x2_t __a, float64x2_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- return (poly8x16_t) __builtin_aarch64_pmulv16qi ((int8x16_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_f32 (float32x2_t __a, float32x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_f64 (float64x1_t __a, float64x1_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_f64 (float64x2_t __a, float64x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_ssublv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_ssublv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t) __builtin_aarch64_ssublv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_usublv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_usublv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_usublv2si ((int32x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_high_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int16x8_t) __builtin_aarch64_ssubl2v16qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_high_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int32x4_t) __builtin_aarch64_ssubl2v8hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_high_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int64x2_t) __builtin_aarch64_ssubl2v4si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_high_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_usubl2v16qi ((int8x16_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_high_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_usubl2v8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_high_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_usubl2v4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_s8 (int16x8_t __a, int8x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_ssubwv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_s16 (int32x4_t __a, int16x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_ssubwv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_s32 (int64x2_t __a, int32x2_t __b)
- {
- return (int64x2_t) __builtin_aarch64_ssubwv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_usubwv8qi ((int16x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_usubwv4hi ((int32x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_usubwv2si ((int64x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_high_s8 (int16x8_t __a, int8x16_t __b)
- {
- return (int16x8_t) __builtin_aarch64_ssubw2v16qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_high_s16 (int32x4_t __a, int16x8_t __b)
- {
- return (int32x4_t) __builtin_aarch64_ssubw2v8hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_high_s32 (int64x2_t __a, int32x4_t __b)
- {
- return (int64x2_t) __builtin_aarch64_ssubw2v4si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_high_u8 (uint16x8_t __a, uint8x16_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_usubw2v16qi ((int16x8_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_high_u16 (uint32x4_t __a, uint16x8_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_usubw2v8hi ((int32x4_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_high_u32 (uint64x2_t __a, uint32x4_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_usubw2v4si ((int64x2_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_sqaddv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_sqaddv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_sqaddv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t) {__builtin_aarch64_sqadddi (__a[0], __b[0])};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_aarch64_shsubv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_uhsubv2si ((int32x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a,
- (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_subhnv2di ((int64x2_t) __a,
- (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a,
- (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a,
- (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a,
- (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
- {
- return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a,
-@@ -2297,7 +2615,8 @@ vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- (int16x8_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a,
-@@ -2305,7 +2624,8 @@ vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- (int32x4_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- {
- return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a,
-@@ -2313,25 +2633,29 @@ vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- (int64x2_t) __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
- {
- return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a,
-@@ -2339,7 +2663,8 @@ vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
- (int16x8_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a,
-@@ -2347,7 +2672,8 @@ vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
- (int32x4_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- {
- return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a,
-@@ -2355,453 +2681,542 @@ vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
- (int64x2_t) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __builtin_aarch64_uqaddv4hi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __builtin_aarch64_uqaddv2si_uuu (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return (uint64x1_t) {__builtin_aarch64_uqadddi_uuu (__a[0], __b[0])};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t) __builtin_aarch64_sqaddv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_sqaddv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_sqaddv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t) __builtin_aarch64_sqaddv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __builtin_aarch64_uqaddv16qi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __builtin_aarch64_uqaddv8hi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __builtin_aarch64_uqaddv4si_uuu (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __builtin_aarch64_uqaddv2di_uuu (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t) __builtin_aarch64_sqsubv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_sqsubv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_sqsubv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t) {__builtin_aarch64_sqsubdi (__a[0], __b[0])};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __builtin_aarch64_uqsubv8qi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __builtin_aarch64_uqsubv4hi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __builtin_aarch64_uqsubv2si_uuu (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return (uint64x1_t) {__builtin_aarch64_uqsubdi_uuu (__a[0], __b[0])};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t) __builtin_aarch64_sqsubv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_sqsubv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_sqsubv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t) __builtin_aarch64_sqsubv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __builtin_aarch64_uqsubv16qi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __builtin_aarch64_uqsubv8hi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __builtin_aarch64_uqsubv4si_uuu (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __builtin_aarch64_uqsubv2di_uuu (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s8 (int8x8_t __a)
- {
- return (int8x8_t) __builtin_aarch64_sqnegv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s16 (int16x4_t __a)
- {
- return (int16x4_t) __builtin_aarch64_sqnegv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s32 (int32x2_t __a)
- {
- return (int32x2_t) __builtin_aarch64_sqnegv2si (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s64 (int64x1_t __a)
- {
- return (int64x1_t) {__builtin_aarch64_sqnegdi (__a[0])};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqnegq_s8 (int8x16_t __a)
- {
- return (int8x16_t) __builtin_aarch64_sqnegv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqnegq_s16 (int16x8_t __a)
- {
- return (int16x8_t) __builtin_aarch64_sqnegv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqnegq_s32 (int32x4_t __a)
- {
- return (int32x4_t) __builtin_aarch64_sqnegv4si (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s8 (int8x8_t __a)
- {
- return (int8x8_t) __builtin_aarch64_sqabsv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s16 (int16x4_t __a)
- {
- return (int16x4_t) __builtin_aarch64_sqabsv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s32 (int32x2_t __a)
- {
- return (int32x2_t) __builtin_aarch64_sqabsv2si (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s64 (int64x1_t __a)
- {
- return (int64x1_t) {__builtin_aarch64_sqabsdi (__a[0])};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabsq_s8 (int8x16_t __a)
- {
- return (int8x16_t) __builtin_aarch64_sqabsv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabsq_s16 (int16x8_t __a)
- {
- return (int16x8_t) __builtin_aarch64_sqabsv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabsq_s32 (int32x4_t __a)
- {
- return (int32x4_t) __builtin_aarch64_sqabsv4si (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_sqdmulhv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_sqdmulhv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_sqdmulhv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_sqdmulhv4si (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t) __builtin_aarch64_sqrdmulhv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t) __builtin_aarch64_sqrdmulhv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t) __builtin_aarch64_sqrdmulhv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s8 (uint64_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s16 (uint64_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s32 (uint64_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s64 (uint64_t __a)
- {
- return (int64x1_t) {__a};
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_f16 (uint64_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_f32 (uint64_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u8 (uint64_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u16 (uint64_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u32 (uint64_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u64 (uint64_t __a)
- {
- return (uint64x1_t) {__a};
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_f64 (uint64_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_p8 (uint64_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_p16 (uint64_t __a)
- {
- return (poly16x4_t) __a;
- }
-
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcreate_p64 (uint64_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
- /* vget_lane */
-
--__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_f16 (float16x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_f32 (float32x2_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_f64 (float64x1_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_p8 (poly8x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_p16 (poly16x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vget_lane_p64 (poly64x1_t __a, const int __b)
-+{
-+ return __aarch64_vget_lane_any (__a, __b);
-+}
-+
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s8 (int8x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s16 (int16x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s32 (int32x2_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s64 (int64x1_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u8 (uint8x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u16 (uint16x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u32 (uint32x2_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u64 (uint64x1_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
-@@ -2809,79 +3224,99 @@ vget_lane_u64 (uint64x1_t __a, const int __b)
-
- /* vgetq_lane */
-
--__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_f16 (float16x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_f32 (float32x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_f64 (float64x2_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_p8 (poly8x16_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_p16 (poly16x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vgetq_lane_p64 (poly64x2_t __a, const int __b)
-+{
-+ return __aarch64_vget_lane_any (__a, __b);
-+}
-+
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s8 (int8x16_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s16 (int16x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s32 (int32x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s64 (int64x2_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u8 (uint8x16_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u16 (uint16x8_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u32 (uint32x4_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u64 (uint64x2_t __a, const int __b)
- {
- return __aarch64_vget_lane_any (__a, __b);
-@@ -2889,1953 +3324,2832 @@ vgetq_lane_u64 (uint64x2_t __a, const int __b)
-
- /* vreinterpret */
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_f16 (float16x4_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_f64 (float64x1_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s8 (int8x8_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s16 (int16x4_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s32 (int32x2_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s64 (int64x1_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_f32 (float32x2_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u8 (uint8x8_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u16 (uint16x4_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u32 (uint32x2_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u64 (uint64x1_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_p16 (poly16x4_t __a)
- {
- return (poly8x8_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p8_p64 (poly64x1_t __a)
-+{
-+ return (poly8x8_t) __a;
-+}
-+
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_f64 (float64x2_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s8 (int8x16_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s16 (int16x8_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s32 (int32x4_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s64 (int64x2_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_f16 (float16x8_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_f32 (float32x4_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u8 (uint8x16_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u16 (uint16x8_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u32 (uint32x4_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u64 (uint64x2_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_p16 (poly16x8_t __a)
- {
- return (poly8x16_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p8_p64 (poly64x2_t __a)
-+{
-+ return (poly8x16_t) __a;
-+}
-+
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p8_p128 (poly128_t __a)
-+{
-+ return (poly8x16_t)__a;
-+}
-+
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_f16 (float16x4_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_f64 (float64x1_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s8 (int8x8_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s16 (int16x4_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s32 (int32x2_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s64 (int64x1_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_f32 (float32x2_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u8 (uint8x8_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u16 (uint16x4_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u32 (uint32x2_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u64 (uint64x1_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_p8 (poly8x8_t __a)
- {
- return (poly16x4_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p16_p64 (poly64x1_t __a)
-+{
-+ return (poly16x4_t) __a;
-+}
-+
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_f64 (float64x2_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s8 (int8x16_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s16 (int16x8_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s32 (int32x4_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s64 (int64x2_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_f16 (float16x8_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_f32 (float32x4_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u8 (uint8x16_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u16 (uint16x8_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u32 (uint32x4_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u64 (uint64x2_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_p8 (poly8x16_t __a)
- {
- return (poly16x8_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p16_p64 (poly64x2_t __a)
-+{
-+ return (poly16x8_t) __a;
-+}
-+
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p16_p128 (poly128_t __a)
-+{
-+ return (poly16x8_t)__a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_f16 (float16x4_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_f64 (float64x1_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_s8 (int8x8_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_s16 (int16x4_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_s32 (int32x2_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_s64 (int64x1_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_f32 (float32x2_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_u8 (uint8x8_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_u16 (uint16x4_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_u32 (uint32x2_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_u64 (uint64x1_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_p8 (poly8x8_t __a)
-+{
-+ return (poly64x1_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_p64_p16 (poly16x4_t __a)
-+{
-+ return (poly64x1_t)__a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_f64 (float64x2_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_s8 (int8x16_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_s16 (int16x8_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_s32 (int32x4_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_s64 (int64x2_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_f16 (float16x8_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_f32 (float32x4_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_p128 (poly128_t __a)
-+{
-+ return (poly64x2_t)__a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_u8 (uint8x16_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_u16 (uint16x8_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_p16 (poly16x8_t __a)
-+{
-+ return (poly64x2_t)__a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_u32 (uint32x4_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_u64 (uint64x2_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p64_p8 (poly8x16_t __a)
-+{
-+ return (poly64x2_t) __a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_p8 (poly8x16_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_p16 (poly16x8_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_f16 (float16x8_t __a)
-+{
-+ return (poly128_t) __a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_f32 (float32x4_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_p64 (poly64x2_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_s64 (int64x2_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_u64 (uint64x2_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_s8 (int8x16_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_s16 (int16x8_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_s32 (int32x4_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_u8 (uint8x16_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_u16 (uint16x8_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_p128_u32 (uint32x4_t __a)
-+{
-+ return (poly128_t)__a;
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_f64 (float64x1_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s8 (int8x8_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s16 (int16x4_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s32 (int32x2_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s64 (int64x1_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_f32 (float32x2_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u8 (uint8x8_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u16 (uint16x4_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u32 (uint32x2_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u64 (uint64x1_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_p8 (poly8x8_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_p16 (poly16x4_t __a)
- {
- return (float16x4_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_f16_p64 (poly64x1_t __a)
-+{
-+ return (float16x4_t) __a;
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_f64 (float64x2_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s8 (int8x16_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s16 (int16x8_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s32 (int32x4_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s64 (int64x2_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_f32 (float32x4_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u8 (uint8x16_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u16 (uint16x8_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u32 (uint32x4_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u64 (uint64x2_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_p8 (poly8x16_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_f16_p128 (poly128_t __a)
-+{
-+ return (float16x8_t) __a;
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_p16 (poly16x8_t __a)
- {
- return (float16x8_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_f16_p64 (poly64x2_t __a)
-+{
-+ return (float16x8_t) __a;
-+}
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_f16 (float16x4_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_f64 (float64x1_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s8 (int8x8_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s16 (int16x4_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s32 (int32x2_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s64 (int64x1_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u8 (uint8x8_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u16 (uint16x4_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u32 (uint32x2_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u64 (uint64x1_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_p8 (poly8x8_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_p16 (poly16x4_t __a)
- {
- return (float32x2_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_f32_p64 (poly64x1_t __a)
-+{
-+ return (float32x2_t) __a;
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_f16 (float16x8_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_f64 (float64x2_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s8 (int8x16_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s16 (int16x8_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s32 (int32x4_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s64 (int64x2_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u8 (uint8x16_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u16 (uint16x8_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u32 (uint32x4_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u64 (uint64x2_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_p8 (poly8x16_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_p16 (poly16x8_t __a)
- {
- return (float32x4_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_f32_p64 (poly64x2_t __a)
-+{
-+ return (float32x4_t) __a;
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_f32_p128 (poly128_t __a)
-+{
-+ return (float32x4_t)__a;
-+}
-+
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_f16 (float16x4_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_f32 (float32x2_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_p8 (poly8x8_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_p16 (poly16x4_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_f64_p64 (poly64x1_t __a)
-+{
-+ return (float64x1_t) __a;
-+}
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_s8 (int8x8_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_s16 (int16x4_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_s32 (int32x2_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_s64 (int64x1_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_u8 (uint8x8_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_u16 (uint16x4_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_u32 (uint32x2_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x1_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f64_u64 (uint64x1_t __a)
- {
- return (float64x1_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_f16 (float16x8_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_f32 (float32x4_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_p8 (poly8x16_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_p16 (poly16x8_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_f64_p64 (poly64x2_t __a)
-+{
-+ return (float64x2_t) __a;
-+}
-+
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_s8 (int8x16_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_s16 (int16x8_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_s32 (int32x4_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_s64 (int64x2_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_u8 (uint8x16_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_u16 (uint16x8_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_u32 (uint32x4_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline float64x2_t __attribute__((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f64_u64 (uint64x2_t __a)
- {
- return (float64x2_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_f16 (float16x4_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_f64 (float64x1_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_s8 (int8x8_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_s16 (int16x4_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_s32 (int32x2_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_f32 (float32x2_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u8 (uint8x8_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u16 (uint16x4_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u32 (uint32x2_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u64 (uint64x1_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_p8 (poly8x8_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_p16 (poly16x4_t __a)
- {
- return (int64x1_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_s64_p64 (poly64x1_t __a)
-+{
-+ return (int64x1_t) __a;
-+}
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_f64 (float64x2_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_s8 (int8x16_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_s16 (int16x8_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_s32 (int32x4_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_f16 (float16x8_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_f32 (float32x4_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u8 (uint8x16_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u16 (uint16x8_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u32 (uint32x4_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u64 (uint64x2_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_p8 (poly8x16_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_p16 (poly16x8_t __a)
- {
- return (int64x2_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s64_p64 (poly64x2_t __a)
-+{
-+ return (int64x2_t) __a;
-+}
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s64_p128 (poly128_t __a)
-+{
-+ return (int64x2_t)__a;
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_f16 (float16x4_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_f64 (float64x1_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s8 (int8x8_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s16 (int16x4_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s32 (int32x2_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s64 (int64x1_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_f32 (float32x2_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_u8 (uint8x8_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_u16 (uint16x4_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_u32 (uint32x2_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_p8 (poly8x8_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_p16 (poly16x4_t __a)
- {
- return (uint64x1_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_u64_p64 (poly64x1_t __a)
-+{
-+ return (uint64x1_t) __a;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_f64 (float64x2_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s8 (int8x16_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s16 (int16x8_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s32 (int32x4_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s64 (int64x2_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_f16 (float16x8_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_f32 (float32x4_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_u8 (uint8x16_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_u16 (uint16x8_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_u32 (uint32x4_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_p8 (poly8x16_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_p16 (poly16x8_t __a)
- {
- return (uint64x2_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u64_p64 (poly64x2_t __a)
-+{
-+ return (uint64x2_t) __a;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u64_p128 (poly128_t __a)
-+{
-+ return (uint64x2_t)__a;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_f16 (float16x4_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_f64 (float64x1_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_s16 (int16x4_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_s32 (int32x2_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_s64 (int64x1_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_f32 (float32x2_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u8 (uint8x8_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u16 (uint16x4_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u32 (uint32x2_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u64 (uint64x1_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_p8 (poly8x8_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_p16 (poly16x4_t __a)
- {
- return (int8x8_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_s8_p64 (poly64x1_t __a)
-+{
-+ return (int8x8_t) __a;
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_f64 (float64x2_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_s16 (int16x8_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_s32 (int32x4_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_s64 (int64x2_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_f16 (float16x8_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_f32 (float32x4_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u8 (uint8x16_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u16 (uint16x8_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u32 (uint32x4_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u64 (uint64x2_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_p8 (poly8x16_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_p16 (poly16x8_t __a)
- {
- return (int8x16_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s8_p64 (poly64x2_t __a)
-+{
-+ return (int8x16_t) __a;
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s8_p128 (poly128_t __a)
-+{
-+ return (int8x16_t)__a;
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_f16 (float16x4_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_f64 (float64x1_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_s8 (int8x8_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_s32 (int32x2_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_s64 (int64x1_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_f32 (float32x2_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u8 (uint8x8_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u16 (uint16x4_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u32 (uint32x2_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u64 (uint64x1_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_p8 (poly8x8_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_p16 (poly16x4_t __a)
- {
- return (int16x4_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_s16_p64 (poly64x1_t __a)
-+{
-+ return (int16x4_t) __a;
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_f64 (float64x2_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_s8 (int8x16_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_s32 (int32x4_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_s64 (int64x2_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_f16 (float16x8_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_f32 (float32x4_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u8 (uint8x16_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u16 (uint16x8_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u32 (uint32x4_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u64 (uint64x2_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_p8 (poly8x16_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_p16 (poly16x8_t __a)
- {
- return (int16x8_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s16_p64 (poly64x2_t __a)
-+{
-+ return (int16x8_t) __a;
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s16_p128 (poly128_t __a)
-+{
-+ return (int16x8_t)__a;
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_f16 (float16x4_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_f64 (float64x1_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_s8 (int8x8_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_s16 (int16x4_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_s64 (int64x1_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_f32 (float32x2_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u8 (uint8x8_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u16 (uint16x4_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u32 (uint32x2_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u64 (uint64x1_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_p8 (poly8x8_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_p16 (poly16x4_t __a)
- {
- return (int32x2_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_s32_p64 (poly64x1_t __a)
-+{
-+ return (int32x2_t) __a;
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_f64 (float64x2_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_s8 (int8x16_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_s16 (int16x8_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_s64 (int64x2_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_f16 (float16x8_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_f32 (float32x4_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u8 (uint8x16_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u16 (uint16x8_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u32 (uint32x4_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u64 (uint64x2_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_p8 (poly8x16_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_p16 (poly16x8_t __a)
- {
- return (int32x4_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s32_p64 (poly64x2_t __a)
-+{
-+ return (int32x4_t) __a;
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_s32_p128 (poly128_t __a)
-+{
-+ return (int32x4_t)__a;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_f16 (float16x4_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_f64 (float64x1_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s8 (int8x8_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s16 (int16x4_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s32 (int32x2_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s64 (int64x1_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_f32 (float32x2_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_u16 (uint16x4_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_u32 (uint32x2_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_u64 (uint64x1_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_p8 (poly8x8_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_p16 (poly16x4_t __a)
- {
- return (uint8x8_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_u8_p64 (poly64x1_t __a)
-+{
-+ return (uint8x8_t) __a;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_f64 (float64x2_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s8 (int8x16_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s16 (int16x8_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s32 (int32x4_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s64 (int64x2_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_f16 (float16x8_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_f32 (float32x4_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_u16 (uint16x8_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_u32 (uint32x4_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_u64 (uint64x2_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_p8 (poly8x16_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_p16 (poly16x8_t __a)
- {
- return (uint8x16_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u8_p64 (poly64x2_t __a)
-+{
-+ return (uint8x16_t) __a;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u8_p128 (poly128_t __a)
-+{
-+ return (uint8x16_t)__a;
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_f16 (float16x4_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_f64 (float64x1_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s8 (int8x8_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s16 (int16x4_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s32 (int32x2_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s64 (int64x1_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_f32 (float32x2_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_u8 (uint8x8_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_u32 (uint32x2_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_u64 (uint64x1_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_p8 (poly8x8_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_p16 (poly16x4_t __a)
- {
- return (uint16x4_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_u16_p64 (poly64x1_t __a)
-+{
-+ return (uint16x4_t) __a;
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_f64 (float64x2_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s8 (int8x16_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s16 (int16x8_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s32 (int32x4_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s64 (int64x2_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_f16 (float16x8_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_f32 (float32x4_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_u8 (uint8x16_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_u32 (uint32x4_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_u64 (uint64x2_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_p8 (poly8x16_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_p16 (poly16x8_t __a)
- {
- return (uint16x8_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u16_p64 (poly64x2_t __a)
-+{
-+ return (uint16x8_t) __a;
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u16_p128 (poly128_t __a)
-+{
-+ return (uint16x8_t)__a;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_f16 (float16x4_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_f64 (float64x1_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s8 (int8x8_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s16 (int16x4_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s32 (int32x2_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s64 (int64x1_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_f32 (float32x2_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_u8 (uint8x8_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_u16 (uint16x4_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_u64 (uint64x1_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_p8 (poly8x8_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_p16 (poly16x4_t __a)
- {
- return (uint32x2_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpret_u32_p64 (poly64x1_t __a)
-+{
-+ return (uint32x2_t) __a;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_f64 (float64x2_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s8 (int8x16_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s16 (int16x8_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s32 (int32x4_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s64 (int64x2_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_f16 (float16x8_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_f32 (float32x4_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_u8 (uint8x16_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_u16 (uint16x8_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_u64 (uint64x2_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_p8 (poly8x16_t __a)
- {
- return (uint32x4_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_p16 (poly16x8_t __a)
- {
- return (uint32x4_t) __a;
- }
-
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u32_p64 (poly64x2_t __a)
-+{
-+ return (uint32x4_t) __a;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vreinterpretq_u32_p128 (poly128_t __a)
-+{
-+ return (uint32x4_t)__a;
-+}
-+
- /* vset_lane */
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_f16 (float16_t __elem, float16x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_f32 (float32_t __elem, float32x2_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_f64 (float64_t __elem, float64x1_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_p8 (poly8_t __elem, poly8x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_p16 (poly16_t __elem, poly16x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vset_lane_p64 (poly64_t __elem, poly64x1_t __vec, const int __index)
-+{
-+ return __aarch64_vset_lane_any (__elem, __vec, __index);
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s8 (int8_t __elem, int8x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s16 (int16_t __elem, int16x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s32 (int32_t __elem, int32x2_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s64 (int64_t __elem, int64x1_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u8 (uint8_t __elem, uint8x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u16 (uint16_t __elem, uint16x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u32 (uint32_t __elem, uint32x2_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
-@@ -4843,79 +6157,99 @@ vset_lane_u64 (uint64_t __elem, uint64x1_t __vec, const int __index)
-
- /* vsetq_lane */
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_f16 (float16_t __elem, float16x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_f32 (float32_t __elem, float32x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_f64 (float64_t __elem, float64x2_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_p8 (poly8_t __elem, poly8x16_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_p16 (poly16_t __elem, poly16x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsetq_lane_p64 (poly64_t __elem, poly64x2_t __vec, const int __index)
-+{
-+ return __aarch64_vset_lane_any (__elem, __vec, __index);
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s8 (int8_t __elem, int8x16_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s16 (int16_t __elem, int16x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s32 (int32_t __elem, int32x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s64 (int64_t __elem, int64x2_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u8 (uint8_t __elem, uint8x16_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u16 (uint16_t __elem, uint16x8_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u32 (uint32_t __elem, uint32x4_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
- {
- return __aarch64_vset_lane_any (__elem, __vec, __index);
-@@ -4926,79 +6260,99 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
- uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0)); \
- return vreinterpret_##__TYPE##_u64 (lo);
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_f16 (float16x8_t __a)
- {
- __GET_LOW (f16);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_f32 (float32x4_t __a)
- {
- __GET_LOW (f32);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_f64 (float64x2_t __a)
- {
- return (float64x1_t) {vgetq_lane_f64 (__a, 0)};
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_p8 (poly8x16_t __a)
- {
- __GET_LOW (p8);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_p16 (poly16x8_t __a)
- {
- __GET_LOW (p16);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vget_low_p64 (poly64x2_t __a)
-+{
-+ __GET_LOW (p64);
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s8 (int8x16_t __a)
- {
- __GET_LOW (s8);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s16 (int16x8_t __a)
- {
- __GET_LOW (s16);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s32 (int32x4_t __a)
- {
- __GET_LOW (s32);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s64 (int64x2_t __a)
- {
- __GET_LOW (s64);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u8 (uint8x16_t __a)
- {
- __GET_LOW (u8);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u16 (uint16x8_t __a)
- {
- __GET_LOW (u16);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u32 (uint32x4_t __a)
- {
- __GET_LOW (u32);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u64 (uint64x2_t __a)
- {
- return vcreate_u64 (vgetq_lane_u64 (__a, 0));
-@@ -5011,73 +6365,92 @@ vget_low_u64 (uint64x2_t __a)
- uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \
- return vreinterpret_##__TYPE##_u64 (hi);
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_f16 (float16x8_t __a)
- {
- __GET_HIGH (f16);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_f32 (float32x4_t __a)
- {
- __GET_HIGH (f32);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_f64 (float64x2_t __a)
- {
- __GET_HIGH (f64);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_p8 (poly8x16_t __a)
- {
- __GET_HIGH (p8);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_p16 (poly16x8_t __a)
- {
- __GET_HIGH (p16);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vget_high_p64 (poly64x2_t __a)
-+{
-+ __GET_HIGH (p64);
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s8 (int8x16_t __a)
- {
- __GET_HIGH (s8);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s16 (int16x8_t __a)
- {
- __GET_HIGH (s16);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s32 (int32x4_t __a)
- {
- __GET_HIGH (s32);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s64 (int64x2_t __a)
- {
- __GET_HIGH (s64);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u8 (uint8x16_t __a)
- {
- __GET_HIGH (u8);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u16 (uint16x8_t __a)
- {
- __GET_HIGH (u16);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u32 (uint32x4_t __a)
- {
- __GET_HIGH (u32);
-@@ -5085,98 +6458,120 @@ vget_high_u32 (uint32x4_t __a)
-
- #undef __GET_HIGH
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u64 (uint64x2_t __a)
- {
- return vcreate_u64 (vgetq_lane_u64 (__a, 1));
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x16_t) __builtin_aarch64_combinev8qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x8_t) __builtin_aarch64_combinev4hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x4_t) __builtin_aarch64_combinev2si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __builtin_aarch64_combinedi (__a[0], __b[0]);
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_f16 (float16x4_t __a, float16x4_t __b)
- {
- return __builtin_aarch64_combinev4hf (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x4_t) __builtin_aarch64_combinev2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x16_t) __builtin_aarch64_combinev8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x8_t) __builtin_aarch64_combinev4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x4_t) __builtin_aarch64_combinev2si ((int32x2_t) __a,
- (int32x2_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return (uint64x2_t) __builtin_aarch64_combinedi (__a[0], __b[0]);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_f64 (float64x1_t __a, float64x1_t __b)
- {
- return __builtin_aarch64_combinedf (__a[0], __b[0]);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (poly8x16_t) __builtin_aarch64_combinev8qi ((int8x8_t) __a,
- (int8x8_t) __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
- {
- return (poly16x8_t) __builtin_aarch64_combinev4hi ((int16x4_t) __a,
- (int16x4_t) __b);
- }
-
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcombine_p64 (poly64x1_t __a, poly64x1_t __b)
-+{
-+ return (poly64x2_t) __builtin_aarch64_combinedi_ppp (__a[0], __b[0]);
-+}
-+
- /* Start of temporary inline asm implementations. */
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
- {
- int8x8_t result;
-@@ -5187,7 +6582,8 @@ vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
- {
- int16x4_t result;
-@@ -5198,7 +6594,8 @@ vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
- {
- int32x2_t result;
-@@ -5209,7 +6606,8 @@ vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
- return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
- {
- uint8x8_t result;
-@@ -5220,7 +6618,8 @@ vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
- {
- uint16x4_t result;
-@@ -5231,7 +6630,8 @@ vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
- {
- uint32x2_t result;
-@@ -5242,7 +6642,8 @@ vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
- {
- int16x8_t result;
-@@ -5253,7 +6654,8 @@ vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
- {
- int32x4_t result;
-@@ -5264,7 +6666,8 @@ vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
- {
- int64x2_t result;
-@@ -5275,7 +6678,8 @@ vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
- {
- uint16x8_t result;
-@@ -5286,7 +6690,8 @@ vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
- {
- uint32x4_t result;
-@@ -5297,7 +6702,8 @@ vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
- {
- uint64x2_t result;
-@@ -5308,7 +6714,8 @@ vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
- {
- int16x8_t result;
-@@ -5319,7 +6726,8 @@ vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
- {
- int32x4_t result;
-@@ -5330,7 +6738,8 @@ vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
- {
- int64x2_t result;
-@@ -5341,7 +6750,8 @@ vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
- {
- uint16x8_t result;
-@@ -5352,7 +6762,8 @@ vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
- {
- uint32x4_t result;
-@@ -5363,7 +6774,8 @@ vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
- {
- uint64x2_t result;
-@@ -5374,7 +6786,8 @@ vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
- return result;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
- {
- int8x16_t result;
-@@ -5385,7 +6798,8 @@ vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
- {
- int16x8_t result;
-@@ -5396,7 +6810,8 @@ vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
- {
- int32x4_t result;
-@@ -5407,7 +6822,8 @@ vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
- return result;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
- {
- uint8x16_t result;
-@@ -5418,7 +6834,8 @@ vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
- {
- uint16x8_t result;
-@@ -5429,7 +6846,8 @@ vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
- {
- uint32x4_t result;
-@@ -5440,18 +6858,8 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
- return result;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vabd_f32 (float32x2_t a, float32x2_t b)
--{
-- float32x2_t result;
-- __asm__ ("fabd %0.2s, %1.2s, %2.2s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_s8 (int8x8_t a, int8x8_t b)
- {
- int8x8_t result;
-@@ -5462,7 +6870,8 @@ vabd_s8 (int8x8_t a, int8x8_t b)
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_s16 (int16x4_t a, int16x4_t b)
- {
- int16x4_t result;
-@@ -5473,7 +6882,8 @@ vabd_s16 (int16x4_t a, int16x4_t b)
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_s32 (int32x2_t a, int32x2_t b)
- {
- int32x2_t result;
-@@ -5484,7 +6894,8 @@ vabd_s32 (int32x2_t a, int32x2_t b)
- return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_u8 (uint8x8_t a, uint8x8_t b)
- {
- uint8x8_t result;
-@@ -5495,7 +6906,8 @@ vabd_u8 (uint8x8_t a, uint8x8_t b)
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_u16 (uint16x4_t a, uint16x4_t b)
- {
- uint16x4_t result;
-@@ -5506,7 +6918,8 @@ vabd_u16 (uint16x4_t a, uint16x4_t b)
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_u32 (uint32x2_t a, uint32x2_t b)
- {
- uint32x2_t result;
-@@ -5517,18 +6930,8 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
- return result;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vabdd_f64 (float64_t a, float64_t b)
--{
-- float64_t result;
-- __asm__ ("fabd %d0, %d1, %d2"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_high_s8 (int8x16_t a, int8x16_t b)
- {
- int16x8_t result;
-@@ -5539,7 +6942,8 @@ vabdl_high_s8 (int8x16_t a, int8x16_t b)
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_high_s16 (int16x8_t a, int16x8_t b)
- {
- int32x4_t result;
-@@ -5550,7 +6954,8 @@ vabdl_high_s16 (int16x8_t a, int16x8_t b)
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_high_s32 (int32x4_t a, int32x4_t b)
- {
- int64x2_t result;
-@@ -5561,7 +6966,8 @@ vabdl_high_s32 (int32x4_t a, int32x4_t b)
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
- {
- uint16x8_t result;
-@@ -5572,7 +6978,8 @@ vabdl_high_u8 (uint8x16_t a, uint8x16_t b)
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
- {
- uint32x4_t result;
-@@ -5583,7 +6990,8 @@ vabdl_high_u16 (uint16x8_t a, uint16x8_t b)
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
- {
- uint64x2_t result;
-@@ -5594,7 +7002,8 @@ vabdl_high_u32 (uint32x4_t a, uint32x4_t b)
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_s8 (int8x8_t a, int8x8_t b)
- {
- int16x8_t result;
-@@ -5605,7 +7014,8 @@ vabdl_s8 (int8x8_t a, int8x8_t b)
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_s16 (int16x4_t a, int16x4_t b)
- {
- int32x4_t result;
-@@ -5616,7 +7026,8 @@ vabdl_s16 (int16x4_t a, int16x4_t b)
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_s32 (int32x2_t a, int32x2_t b)
- {
- int64x2_t result;
-@@ -5627,7 +7038,8 @@ vabdl_s32 (int32x2_t a, int32x2_t b)
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_u8 (uint8x8_t a, uint8x8_t b)
- {
- uint16x8_t result;
-@@ -5638,7 +7050,8 @@ vabdl_u8 (uint8x8_t a, uint8x8_t b)
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_u16 (uint16x4_t a, uint16x4_t b)
- {
- uint32x4_t result;
-@@ -5649,7 +7062,8 @@ vabdl_u16 (uint16x4_t a, uint16x4_t b)
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_u32 (uint32x2_t a, uint32x2_t b)
- {
- uint64x2_t result;
-@@ -5660,29 +7074,8 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
- return result;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vabdq_f32 (float32x4_t a, float32x4_t b)
--{
-- float32x4_t result;
-- __asm__ ("fabd %0.4s, %1.4s, %2.4s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vabdq_f64 (float64x2_t a, float64x2_t b)
--{
-- float64x2_t result;
-- __asm__ ("fabd %0.2d, %1.2d, %2.2d"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_s8 (int8x16_t a, int8x16_t b)
- {
- int8x16_t result;
-@@ -5693,7 +7086,8 @@ vabdq_s8 (int8x16_t a, int8x16_t b)
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_s16 (int16x8_t a, int16x8_t b)
- {
- int16x8_t result;
-@@ -5704,7 +7098,8 @@ vabdq_s16 (int16x8_t a, int16x8_t b)
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_s32 (int32x4_t a, int32x4_t b)
- {
- int32x4_t result;
-@@ -5715,7 +7110,8 @@ vabdq_s32 (int32x4_t a, int32x4_t b)
- return result;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_u8 (uint8x16_t a, uint8x16_t b)
- {
- uint8x16_t result;
-@@ -5726,7 +7122,8 @@ vabdq_u8 (uint8x16_t a, uint8x16_t b)
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_u16 (uint16x8_t a, uint16x8_t b)
- {
- uint16x8_t result;
-@@ -5737,7 +7134,8 @@ vabdq_u16 (uint16x8_t a, uint16x8_t b)
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_u32 (uint32x4_t a, uint32x4_t b)
- {
- uint32x4_t result;
-@@ -5748,18 +7146,8 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
- return result;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vabds_f32 (float32_t a, float32_t b)
--{
-- float32_t result;
-- __asm__ ("fabd %s0, %s1, %s2"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlv_s8 (int8x8_t a)
- {
- int16_t result;
-@@ -5770,7 +7158,8 @@ vaddlv_s8 (int8x8_t a)
- return result;
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlv_s16 (int16x4_t a)
- {
- int32_t result;
-@@ -5781,7 +7170,8 @@ vaddlv_s16 (int16x4_t a)
- return result;
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlv_u8 (uint8x8_t a)
- {
- uint16_t result;
-@@ -5792,7 +7182,8 @@ vaddlv_u8 (uint8x8_t a)
- return result;
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlv_u16 (uint16x4_t a)
- {
- uint32_t result;
-@@ -5803,7 +7194,8 @@ vaddlv_u16 (uint16x4_t a)
- return result;
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlvq_s8 (int8x16_t a)
- {
- int16_t result;
-@@ -5814,7 +7206,8 @@ vaddlvq_s8 (int8x16_t a)
- return result;
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlvq_s16 (int16x8_t a)
- {
- int32_t result;
-@@ -5825,7 +7218,8 @@ vaddlvq_s16 (int16x8_t a)
- return result;
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlvq_s32 (int32x4_t a)
- {
- int64_t result;
-@@ -5836,7 +7230,8 @@ vaddlvq_s32 (int32x4_t a)
- return result;
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlvq_u8 (uint8x16_t a)
- {
- uint16_t result;
-@@ -5847,7 +7242,8 @@ vaddlvq_u8 (uint8x16_t a)
- return result;
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlvq_u16 (uint16x8_t a)
- {
- uint32_t result;
-@@ -5858,7 +7254,8 @@ vaddlvq_u16 (uint16x8_t a)
- return result;
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddlvq_u32 (uint32x4_t a)
- {
- uint64_t result;
-@@ -5869,18584 +7266,23100 @@ vaddlvq_u32 (uint32x4_t a)
- return result;
- }
-
--#define vcopyq_lane_f32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- float32x4_t c_ = (c); \
-- float32x4_t a_ = (a); \
-- float32x4_t result; \
-- __asm__ ("ins %0.s[%2], %3.s[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtx_f32_f64 (float64x2_t a)
-+{
-+ float32x2_t result;
-+ __asm__ ("fcvtxn %0.2s,%1.2d"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b)
-+{
-+ float32x4_t result;
-+ __asm__ ("fcvtxn2 %0.4s,%1.2d"
-+ : "=w"(result)
-+ : "w" (b), "0"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtxd_f32_f64 (float64_t a)
-+{
-+ float32_t result;
-+ __asm__ ("fcvtxn %s0,%d1"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
-+{
-+ float32x2_t result;
-+ float32x2_t t1;
-+ __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s"
-+ : "=w"(result), "=w"(t1)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
-+{
-+ int16x4_t result;
-+ __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
-+{
-+ int32x2_t result;
-+ __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
-+{
-+ uint16x4_t result;
-+ __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
-+{
-+ uint32x2_t result;
-+ __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
-+{
-+ int8x8_t result;
-+ __asm__ ("mla %0.8b, %2.8b, %3.8b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t result;
-+ __asm__ ("mla %0.4h, %2.4h, %3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
-+{
-+ int32x2_t result;
-+ __asm__ ("mla %0.2s, %2.2s, %3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ uint8x8_t result;
-+ __asm__ ("mla %0.8b, %2.8b, %3.8b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
-+{
-+ uint16x4_t result;
-+ __asm__ ("mla %0.4h, %2.4h, %3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+{
-+ uint32x2_t result;
-+ __asm__ ("mla %0.2s, %2.2s, %3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcopyq_lane_f64(a, b, c, d) \
-+#define vmlal_high_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
-- float64x2_t c_ = (c); \
-- float64x2_t a_ = (a); \
-- float64x2_t result; \
-- __asm__ ("ins %0.d[%2], %3.d[%4]" \
-+ int16x4_t c_ = (c); \
-+ int16x8_t b_ = (b); \
-+ int32x4_t a_ = (a); \
-+ int32x4_t result; \
-+ __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_p8(a, b, c, d) \
-+#define vmlal_high_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
-- poly8x16_t c_ = (c); \
-- poly8x16_t a_ = (a); \
-- poly8x16_t result; \
-- __asm__ ("ins %0.b[%2], %3.b[%4]" \
-+ int32x2_t c_ = (c); \
-+ int32x4_t b_ = (b); \
-+ int64x2_t a_ = (a); \
-+ int64x2_t result; \
-+ __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_p16(a, b, c, d) \
-+#define vmlal_high_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
-- poly16x8_t c_ = (c); \
-- poly16x8_t a_ = (a); \
-- poly16x8_t result; \
-- __asm__ ("ins %0.h[%2], %3.h[%4]" \
-+ uint16x4_t c_ = (c); \
-+ uint16x8_t b_ = (b); \
-+ uint32x4_t a_ = (a); \
-+ uint32x4_t result; \
-+ __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_s8(a, b, c, d) \
-+#define vmlal_high_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
-- int8x16_t c_ = (c); \
-- int8x16_t a_ = (a); \
-- int8x16_t result; \
-- __asm__ ("ins %0.b[%2], %3.b[%4]" \
-+ uint32x2_t c_ = (c); \
-+ uint32x4_t b_ = (b); \
-+ uint64x2_t a_ = (a); \
-+ uint64x2_t result; \
-+ __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_s16(a, b, c, d) \
-+#define vmlal_high_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
-- int16x8_t a_ = (a); \
-- int16x8_t result; \
-- __asm__ ("ins %0.h[%2], %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vcopyq_lane_s32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int32x4_t c_ = (c); \
-+ int16x8_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("ins %0.s[%2], %3.s[%4]" \
-+ __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_s64(a, b, c, d) \
-+#define vmlal_high_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
-- int64x2_t c_ = (c); \
-+ int32x4_t c_ = (c); \
-+ int32x4_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("ins %0.d[%2], %3.d[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vcopyq_lane_u8(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint8x16_t c_ = (c); \
-- uint8x16_t a_ = (a); \
-- uint8x16_t result; \
-- __asm__ ("ins %0.b[%2], %3.b[%4]" \
-+ __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_u16(a, b, c, d) \
-+#define vmlal_high_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
-- uint16x8_t a_ = (a); \
-- uint16x8_t result; \
-- __asm__ ("ins %0.h[%2], %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vcopyq_lane_u32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint32x4_t c_ = (c); \
-+ uint16x8_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("ins %0.s[%2], %3.s[%4]" \
-+ __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcopyq_lane_u64(a, b, c, d) \
-+#define vmlal_high_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
-- uint64x2_t c_ = (c); \
-+ uint32x4_t c_ = (c); \
-+ uint32x4_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("ins %0.d[%2], %3.d[%4]" \
-+ __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
-- : "0"(a_), "i"(b), "w"(c_), "i"(d) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvt_n_f32_s32(a, b) \
-- __extension__ \
-- ({ \
-- int32x2_t a_ = (a); \
-- float32x2_t result; \
-- __asm__ ("scvtf %0.2s, %1.2s, #%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
-+{
-+ int32x4_t result;
-+ __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvt_n_f32_u32(a, b) \
-- __extension__ \
-- ({ \
-- uint32x2_t a_ = (a); \
-- float32x2_t result; \
-- __asm__ ("ucvtf %0.2s, %1.2s, #%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
-+{
-+ int64x2_t result;
-+ __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvt_n_s32_f32(a, b) \
-- __extension__ \
-- ({ \
-- float32x2_t a_ = (a); \
-- int32x2_t result; \
-- __asm__ ("fcvtzs %0.2s, %1.2s, #%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
-+{
-+ uint32x4_t result;
-+ __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvt_n_u32_f32(a, b) \
-- __extension__ \
-- ({ \
-- float32x2_t a_ = (a); \
-- uint32x2_t result; \
-- __asm__ ("fcvtzu %0.2s, %1.2s, #%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
-+{
-+ uint64x2_t result;
-+ __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvtd_n_f64_s64(a, b) \
-- __extension__ \
-- ({ \
-- int64_t a_ = (a); \
-- float64_t result; \
-- __asm__ ("scvtf %d0,%d1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
-+{
-+ int16x8_t result;
-+ __asm__ ("smlal2 %0.8h,%2.16b,%3.16b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvtd_n_f64_u64(a, b) \
-- __extension__ \
-- ({ \
-- uint64_t a_ = (a); \
-- float64_t result; \
-- __asm__ ("ucvtf %d0,%d1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
-+{
-+ int32x4_t result;
-+ __asm__ ("smlal2 %0.4s,%2.8h,%3.8h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvtd_n_s64_f64(a, b) \
-- __extension__ \
-- ({ \
-- float64_t a_ = (a); \
-- int64_t result; \
-- __asm__ ("fcvtzs %d0,%d1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
-+{
-+ int64x2_t result;
-+ __asm__ ("smlal2 %0.2d,%2.4s,%3.4s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvtd_n_u64_f64(a, b) \
-- __extension__ \
-- ({ \
-- float64_t a_ = (a); \
-- uint64_t result; \
-- __asm__ ("fcvtzu %d0,%d1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
-+{
-+ uint16x8_t result;
-+ __asm__ ("umlal2 %0.8h,%2.16b,%3.16b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
-+{
-+ uint32x4_t result;
-+ __asm__ ("umlal2 %0.4s,%2.8h,%3.8h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
-+{
-+ uint64x2_t result;
-+ __asm__ ("umlal2 %0.2d,%2.4s,%3.4s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvtq_n_f32_s32(a, b) \
-+#define vmlal_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
-+ int16x4_t c_ = (c); \
-+ int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
-- float32x4_t result; \
-- __asm__ ("scvtf %0.4s, %1.4s, #%2" \
-+ int32x4_t result; \
-+ __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_f32_u32(a, b) \
-+#define vmlal_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
-- uint32x4_t a_ = (a); \
-- float32x4_t result; \
-- __asm__ ("ucvtf %0.4s, %1.4s, #%2" \
-+ int32x2_t c_ = (c); \
-+ int32x2_t b_ = (b); \
-+ int64x2_t a_ = (a); \
-+ int64x2_t result; \
-+ __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_f64_s64(a, b) \
-+#define vmlal_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
-- int64x2_t a_ = (a); \
-- float64x2_t result; \
-- __asm__ ("scvtf %0.2d, %1.2d, #%2" \
-+ uint16x4_t c_ = (c); \
-+ uint16x4_t b_ = (b); \
-+ uint32x4_t a_ = (a); \
-+ uint32x4_t result; \
-+ __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_f64_u64(a, b) \
-+#define vmlal_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
-+ uint32x2_t c_ = (c); \
-+ uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
-- float64x2_t result; \
-- __asm__ ("ucvtf %0.2d, %1.2d, #%2" \
-+ uint64x2_t result; \
-+ __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_s32_f32(a, b) \
-+#define vmlal_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
-- float32x4_t a_ = (a); \
-+ int16x8_t c_ = (c); \
-+ int16x4_t b_ = (b); \
-+ int32x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("fcvtzs %0.4s, %1.4s, #%2" \
-+ __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_s64_f64(a, b) \
-+#define vmlal_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
-- float64x2_t a_ = (a); \
-+ int32x4_t c_ = (c); \
-+ int32x2_t b_ = (b); \
-+ int64x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("fcvtzs %0.2d, %1.2d, #%2" \
-+ __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_u32_f32(a, b) \
-+#define vmlal_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
-- float32x4_t a_ = (a); \
-+ uint16x8_t c_ = (c); \
-+ uint16x4_t b_ = (b); \
-+ uint32x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("fcvtzu %0.4s, %1.4s, #%2" \
-+ __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvtq_n_u64_f64(a, b) \
-+#define vmlal_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
-- float64x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("fcvtzu %0.2d, %1.2d, #%2" \
-+ uint32x4_t c_ = (c); \
-+ uint32x2_t b_ = (b); \
-+ uint64x2_t a_ = (a); \
-+ uint64x2_t result; \
-+ __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vcvts_n_f32_s32(a, b) \
-- __extension__ \
-- ({ \
-- int32_t a_ = (a); \
-- float32_t result; \
-- __asm__ ("scvtf %s0,%s1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
-+{
-+ int32x4_t result;
-+ __asm__ ("smlal %0.4s,%2.4h,%3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvts_n_f32_u32(a, b) \
-- __extension__ \
-- ({ \
-- uint32_t a_ = (a); \
-- float32_t result; \
-- __asm__ ("ucvtf %s0,%s1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c)
-+{
-+ int64x2_t result;
-+ __asm__ ("smlal %0.2d,%2.2s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvts_n_s32_f32(a, b) \
-- __extension__ \
-- ({ \
-- float32_t a_ = (a); \
-- int32_t result; \
-- __asm__ ("fcvtzs %s0,%s1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
-+{
-+ uint32x4_t result;
-+ __asm__ ("umlal %0.4s,%2.4h,%3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vcvts_n_u32_f32(a, b) \
-- __extension__ \
-- ({ \
-- float32_t a_ = (a); \
-- uint32_t result; \
-- __asm__ ("fcvtzu %s0,%s1,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c)
-+{
-+ uint64x2_t result;
-+ __asm__ ("umlal %0.2d,%2.2s,%3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvtx_f32_f64 (float64x2_t a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
- {
-- float32x2_t result;
-- __asm__ ("fcvtxn %0.2s,%1.2d"
-+ int16x8_t result;
-+ __asm__ ("smlal %0.8h,%2.8b,%3.8b"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
- {
-- float32x4_t result;
-- __asm__ ("fcvtxn2 %0.4s,%1.2d"
-+ int32x4_t result;
-+ __asm__ ("smlal %0.4s,%2.4h,%3.4h"
- : "=w"(result)
-- : "w" (b), "0"(a)
-+ : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vcvtxd_f32_f64 (float64_t a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
- {
-- float32_t result;
-- __asm__ ("fcvtxn %s0,%d1"
-+ int64x2_t result;
-+ __asm__ ("smlal %0.2d,%2.2s,%3.2s"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
- {
-- float32x2_t result;
-- float32x2_t t1;
-- __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s"
-+ uint16x8_t result;
-+ __asm__ ("umlal %0.8h,%2.8b,%3.8b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
-+{
-+ uint32x4_t result;
-+ __asm__ ("umlal %0.4s,%2.4h,%3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
-+{
-+ uint64x2_t result;
-+ __asm__ ("umlal %0.2d,%2.2s,%3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
-+{
-+ float32x4_t result;
-+ float32x4_t t1;
-+ __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s"
- : "=w"(result), "=w"(t1)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
- {
-- int16x4_t result;
-- __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-+ int16x8_t result;
-+ __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
- {
-- int32x2_t result;
-- __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-+ int32x4_t result;
-+ __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
- {
-- uint16x4_t result;
-- __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-+ uint16x8_t result;
-+ __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
- {
-- uint32x2_t result;
-- __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-+ uint32x4_t result;
-+ __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
- {
-- int8x8_t result;
-- __asm__ ("mla %0.8b, %2.8b, %3.8b"
-+ int8x16_t result;
-+ __asm__ ("mla %0.16b, %2.16b, %3.16b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
- {
-- int16x4_t result;
-- __asm__ ("mla %0.4h, %2.4h, %3.4h"
-+ int16x8_t result;
-+ __asm__ ("mla %0.8h, %2.8h, %3.8h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
- {
-- int32x2_t result;
-- __asm__ ("mla %0.2s, %2.2s, %3.2s"
-+ int32x4_t result;
-+ __asm__ ("mla %0.4s, %2.4s, %3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
- {
-- uint8x8_t result;
-- __asm__ ("mla %0.8b, %2.8b, %3.8b"
-+ uint8x16_t result;
-+ __asm__ ("mla %0.16b, %2.16b, %3.16b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
- {
-- uint16x4_t result;
-- __asm__ ("mla %0.4h, %2.4h, %3.4h"
-+ uint16x8_t result;
-+ __asm__ ("mla %0.8h, %2.8h, %3.8h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
- {
-- uint32x2_t result;
-- __asm__ ("mla %0.2s, %2.2s, %3.2s"
-+ uint32x4_t result;
-+ __asm__ ("mla %0.4s, %2.4s, %3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--#define vmlal_high_lane_s16(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int16x4_t c_ = (c); \
-- int16x8_t b_ = (b); \
-- int32x4_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmlal_high_lane_s32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int32x2_t c_ = (c); \
-- int32x4_t b_ = (b); \
-- int64x2_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
-+{
-+ float32x2_t result;
-+ float32x2_t t1;
-+ __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s"
-+ : "=w"(result), "=w"(t1)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlal_high_lane_u16(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint16x4_t c_ = (c); \
-- uint16x8_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
-+{
-+ int16x4_t result;
-+ __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlal_high_lane_u32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint32x2_t c_ = (c); \
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
-+{
-+ int32x2_t result;
-+ __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
-+{
-+ uint16x4_t result;
-+ __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "x"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
-+{
-+ uint32x2_t result;
-+ __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
-+{
-+ int8x8_t result;
-+ __asm__ ("mls %0.8b,%2.8b,%3.8b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t result;
-+ __asm__ ("mls %0.4h,%2.4h,%3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
-+{
-+ int32x2_t result;
-+ __asm__ ("mls %0.2s,%2.2s,%3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ uint8x8_t result;
-+ __asm__ ("mls %0.8b,%2.8b,%3.8b"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
-+{
-+ uint16x4_t result;
-+ __asm__ ("mls %0.4h,%2.4h,%3.4h"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+{
-+ uint32x2_t result;
-+ __asm__ ("mls %0.2s,%2.2s,%3.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b), "w"(c)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+#define vmlsl_high_lane_s16(a, b, c, d) \
-+ __extension__ \
-+ ({ \
-+ int16x4_t c_ = (c); \
-+ int16x8_t b_ = (b); \
-+ int32x4_t a_ = (a); \
-+ int32x4_t result; \
-+ __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmlsl_high_lane_s32(a, b, c, d) \
-+ __extension__ \
-+ ({ \
-+ int32x2_t c_ = (c); \
-+ int32x4_t b_ = (b); \
-+ int64x2_t a_ = (a); \
-+ int64x2_t result; \
-+ __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmlsl_high_lane_u16(a, b, c, d) \
-+ __extension__ \
-+ ({ \
-+ uint16x4_t c_ = (c); \
-+ uint16x8_t b_ = (b); \
-+ uint32x4_t a_ = (a); \
-+ uint32x4_t result; \
-+ __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmlsl_high_lane_u32(a, b, c, d) \
-+ __extension__ \
-+ ({ \
-+ uint32x2_t c_ = (c); \
- uint32x4_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
-+ __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_laneq_s16(a, b, c, d) \
-+#define vmlsl_high_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x8_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \
-+ __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_laneq_s32(a, b, c, d) \
-+#define vmlsl_high_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x4_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \
-+ __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_laneq_u16(a, b, c, d) \
-+#define vmlsl_high_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x8_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \
-+ __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_high_laneq_u32(a, b, c, d) \
-+#define vmlsl_high_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x4_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \
-+ __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
- {
- int32x4_t result;
-- __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]"
-+ __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
- {
- int64x2_t result;
-- __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]"
-+ __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
- {
- uint32x4_t result;
-- __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]"
-+ __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
- {
- uint64x2_t result;
-- __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]"
-+ __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
- {
- int16x8_t result;
-- __asm__ ("smlal2 %0.8h,%2.16b,%3.16b"
-+ __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
- {
- int32x4_t result;
-- __asm__ ("smlal2 %0.4s,%2.8h,%3.8h"
-+ __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
- {
- int64x2_t result;
-- __asm__ ("smlal2 %0.2d,%2.4s,%3.4s"
-+ __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
- {
- uint16x8_t result;
-- __asm__ ("umlal2 %0.8h,%2.16b,%3.16b"
-+ __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
- {
- uint32x4_t result;
-- __asm__ ("umlal2 %0.4s,%2.8h,%3.8h"
-+ __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
- {
- uint64x2_t result;
-- __asm__ ("umlal2 %0.2d,%2.4s,%3.4s"
-+ __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--#define vmlal_lane_s16(a, b, c, d) \
-+#define vmlsl_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x4_t c_ = (c); \
- int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \
-+ __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_lane_s32(a, b, c, d) \
-+#define vmlsl_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x2_t c_ = (c); \
- int32x2_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \
-+ __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_lane_u16(a, b, c, d) \
-+#define vmlsl_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x4_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \
-+ __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_lane_u32(a, b, c, d) \
-+#define vmlsl_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x2_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_laneq_s16(a, b, c, d) \
-+#define vmlsl_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \
-+ __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_laneq_s32(a, b, c, d) \
-+#define vmlsl_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x2_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_laneq_u16(a, b, c, d) \
-+#define vmlsl_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \
-+ __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlal_laneq_u32(a, b, c, d) \
-+#define vmlsl_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
- {
- int32x4_t result;
-- __asm__ ("smlal %0.4s,%2.4h,%3.h[0]"
-+ __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c)
- {
- int64x2_t result;
-- __asm__ ("smlal %0.2d,%2.2s,%3.s[0]"
-+ __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
- {
- uint32x4_t result;
-- __asm__ ("umlal %0.4s,%2.4h,%3.h[0]"
-+ __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c)
- {
- uint64x2_t result;
-- __asm__ ("umlal %0.2d,%2.2s,%3.s[0]"
-+ __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
- {
- int16x8_t result;
-- __asm__ ("smlal %0.8h,%2.8b,%3.8b"
-+ __asm__ ("smlsl %0.8h, %2.8b, %3.8b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
- {
- int32x4_t result;
-- __asm__ ("smlal %0.4s,%2.4h,%3.4h"
-+ __asm__ ("smlsl %0.4s, %2.4h, %3.4h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
- {
- int64x2_t result;
-- __asm__ ("smlal %0.2d,%2.2s,%3.2s"
-+ __asm__ ("smlsl %0.2d, %2.2s, %3.2s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
- {
- uint16x8_t result;
-- __asm__ ("umlal %0.8h,%2.8b,%3.8b"
-+ __asm__ ("umlsl %0.8h, %2.8b, %3.8b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
- {
- uint32x4_t result;
-- __asm__ ("umlal %0.4s,%2.4h,%3.4h"
-+ __asm__ ("umlsl %0.4s, %2.4h, %3.4h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
- {
- uint64x2_t result;
-- __asm__ ("umlal %0.2d,%2.2s,%3.2s"
-+ __asm__ ("umlsl %0.2d, %2.2s, %3.2s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
- {
- float32x4_t result;
- float32x4_t t1;
-- __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s"
-+ __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s"
- : "=w"(result), "=w"(t1)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
- {
- int16x8_t result;
-- __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
-+ __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
- {
- int32x4_t result;
-- __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
-+ __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
- {
- uint16x8_t result;
-- __asm__ ("mla %0.8h,%2.8h,%3.h[0]"
-+ __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "x"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
- {
- uint32x4_t result;
-- __asm__ ("mla %0.4s,%2.4s,%3.s[0]"
-+ __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
- {
- int8x16_t result;
-- __asm__ ("mla %0.16b, %2.16b, %3.16b"
-+ __asm__ ("mls %0.16b,%2.16b,%3.16b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
- {
- int16x8_t result;
-- __asm__ ("mla %0.8h, %2.8h, %3.8h"
-+ __asm__ ("mls %0.8h,%2.8h,%3.8h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
- {
- int32x4_t result;
-- __asm__ ("mla %0.4s, %2.4s, %3.4s"
-+ __asm__ ("mls %0.4s,%2.4s,%3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
- {
- uint8x16_t result;
-- __asm__ ("mla %0.16b, %2.16b, %3.16b"
-+ __asm__ ("mls %0.16b,%2.16b,%3.16b"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
- {
- uint16x8_t result;
-- __asm__ ("mla %0.8h, %2.8h, %3.8h"
-+ __asm__ ("mls %0.8h,%2.8h,%3.8h"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
- {
- uint32x4_t result;
-- __asm__ ("mla %0.4s, %2.4s, %3.4s"
-+ __asm__ ("mls %0.4s,%2.4s,%3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_high_s8 (int8x16_t a)
- {
-- float32x2_t result;
-- float32x2_t t1;
-- __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s"
-- : "=w"(result), "=w"(t1)
-- : "0"(a), "w"(b), "w"(c)
-+ int16x8_t result;
-+ __asm__ ("sshll2 %0.8h,%1.16b,#0"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_high_s16 (int16x8_t a)
- {
-- int16x4_t result;
-- __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
-+ int32x4_t result;
-+ __asm__ ("sshll2 %0.4s,%1.8h,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_high_s32 (int32x4_t a)
- {
-- int32x2_t result;
-- __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
-+ int64x2_t result;
-+ __asm__ ("sshll2 %0.2d,%1.4s,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_high_u8 (uint8x16_t a)
- {
-- uint16x4_t result;
-- __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
-+ uint16x8_t result;
-+ __asm__ ("ushll2 %0.8h,%1.16b,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_high_u16 (uint16x8_t a)
- {
-- uint32x2_t result;
-- __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
-+ uint32x4_t result;
-+ __asm__ ("ushll2 %0.4s,%1.8h,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_high_u32 (uint32x4_t a)
- {
-- int8x8_t result;
-- __asm__ ("mls %0.8b,%2.8b,%3.8b"
-+ uint64x2_t result;
-+ __asm__ ("ushll2 %0.2d,%1.4s,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_s8 (int8x8_t a)
- {
-- int16x4_t result;
-- __asm__ ("mls %0.4h,%2.4h,%3.4h"
-+ int16x8_t result;
-+ __asm__ ("sshll %0.8h,%1.8b,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_s16 (int16x4_t a)
- {
-- int32x2_t result;
-- __asm__ ("mls %0.2s,%2.2s,%3.2s"
-+ int32x4_t result;
-+ __asm__ ("sshll %0.4s,%1.4h,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_s32 (int32x2_t a)
- {
-- uint8x8_t result;
-- __asm__ ("mls %0.8b,%2.8b,%3.8b"
-+ int64x2_t result;
-+ __asm__ ("sshll %0.2d,%1.2s,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_u8 (uint8x8_t a)
- {
-- uint16x4_t result;
-- __asm__ ("mls %0.4h,%2.4h,%3.4h"
-+ uint16x8_t result;
-+ __asm__ ("ushll %0.8h,%1.8b,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_u16 (uint16x4_t a)
- {
-- uint32x2_t result;
-- __asm__ ("mls %0.2s,%2.2s,%3.2s"
-+ uint32x4_t result;
-+ __asm__ ("ushll %0.4s,%1.4h,#0"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--#define vmlsl_high_lane_s16(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int16x4_t c_ = (c); \
-- int16x8_t b_ = (b); \
-- int32x4_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovl_u32 (uint32x2_t a)
-+{
-+ uint64x2_t result;
-+ __asm__ ("ushll %0.2d,%1.2s,#0"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlsl_high_lane_s32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int32x2_t c_ = (c); \
-- int32x4_t b_ = (b); \
-- int64x2_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_high_s16 (int8x8_t a, int16x8_t b)
-+{
-+ int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("xtn2 %0.16b,%1.8h"
-+ : "+w"(result)
-+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlsl_high_lane_u16(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint16x4_t c_ = (c); \
-- uint16x8_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_high_s32 (int16x4_t a, int32x4_t b)
-+{
-+ int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("xtn2 %0.8h,%1.4s"
-+ : "+w"(result)
-+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlsl_high_lane_u32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint32x2_t c_ = (c); \
-- uint32x4_t b_ = (b); \
-- uint64x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_high_s64 (int32x2_t a, int64x2_t b)
-+{
-+ int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("xtn2 %0.4s,%1.2d"
-+ : "+w"(result)
-+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlsl_high_laneq_s16(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int16x8_t c_ = (c); \
-- int16x8_t b_ = (b); \
-- int32x4_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_high_u16 (uint8x8_t a, uint16x8_t b)
-+{
-+ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("xtn2 %0.16b,%1.8h"
-+ : "+w"(result)
-+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmlsl_high_laneq_s32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- int32x4_t c_ = (c); \
-- int32x4_t b_ = (b); \
-- int64x2_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmlsl_high_laneq_u16(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint16x8_t c_ = (c); \
-- uint16x8_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmlsl_high_laneq_u32(a, b, c, d) \
-- __extension__ \
-- ({ \
-- uint32x4_t c_ = (c); \
-- uint32x4_t b_ = (b); \
-- uint64x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c)
--{
-- int32x4_t result;
-- __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]"
-- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c)
--{
-- int64x2_t result;
-- __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]"
-- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_high_u32 (uint16x4_t a, uint32x4_t b)
- {
-- uint32x4_t result;
-- __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]"
-- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("xtn2 %0.8h,%1.4s"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_high_u64 (uint32x2_t a, uint64x2_t b)
- {
-- uint64x2_t result;
-- __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]"
-- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("xtn2 %0.4s,%1.2d"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_s16 (int16x8_t a)
- {
-- int16x8_t result;
-- __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b"
-+ int8x8_t result;
-+ __asm__ ("xtn %0.8b,%1.8h"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_s32 (int32x4_t a)
- {
-- int32x4_t result;
-- __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h"
-+ int16x4_t result;
-+ __asm__ ("xtn %0.4h,%1.4s"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_s64 (int64x2_t a)
- {
-- int64x2_t result;
-- __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s"
-+ int32x2_t result;
-+ __asm__ ("xtn %0.2s,%1.2d"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_u16 (uint16x8_t a)
- {
-- uint16x8_t result;
-- __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b"
-+ uint8x8_t result;
-+ __asm__ ("xtn %0.8b,%1.8h"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_u32 (uint32x4_t a)
- {
-- uint32x4_t result;
-- __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h"
-+ uint16x4_t result;
-+ __asm__ ("xtn %0.4h,%1.4s"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovn_u64 (uint64x2_t a)
- {
-- uint64x2_t result;
-- __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s"
-+ uint32x2_t result;
-+ __asm__ ("xtn %0.2s,%1.2d"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--#define vmlsl_lane_s16(a, b, c, d) \
-+#define vmull_high_lane_s16(a, b, c) \
- __extension__ \
- ({ \
-- int16x4_t c_ = (c); \
- int16x4_t b_ = (b); \
-- int32x4_t a_ = (a); \
-+ int16x8_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
-+ __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_lane_s32(a, b, c, d) \
-+#define vmull_high_lane_s32(a, b, c) \
- __extension__ \
- ({ \
-- int32x2_t c_ = (c); \
- int32x2_t b_ = (b); \
-- int64x2_t a_ = (a); \
-+ int32x4_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_lane_u16(a, b, c, d) \
-+#define vmull_high_lane_u16(a, b, c) \
- __extension__ \
- ({ \
-- uint16x4_t c_ = (c); \
- uint16x4_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-+ uint16x8_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
-+ __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_lane_u32(a, b, c, d) \
-+#define vmull_high_lane_u32(a, b, c) \
- __extension__ \
- ({ \
-- uint32x2_t c_ = (c); \
- uint32x2_t b_ = (b); \
-- uint64x2_t a_ = (a); \
-+ uint32x4_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_s16(a, b, c, d) \
-+#define vmull_high_laneq_s16(a, b, c) \
- __extension__ \
- ({ \
-- int16x8_t c_ = (c); \
-- int16x4_t b_ = (b); \
-- int32x4_t a_ = (a); \
-+ int16x8_t b_ = (b); \
-+ int16x8_t a_ = (a); \
- int32x4_t result; \
-- __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \
-+ __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_s32(a, b, c, d) \
-+#define vmull_high_laneq_s32(a, b, c) \
- __extension__ \
- ({ \
-- int32x4_t c_ = (c); \
-- int32x2_t b_ = (b); \
-- int64x2_t a_ = (a); \
-+ int32x4_t b_ = (b); \
-+ int32x4_t a_ = (a); \
- int64x2_t result; \
-- __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_u16(a, b, c, d) \
-+#define vmull_high_laneq_u16(a, b, c) \
- __extension__ \
- ({ \
-- uint16x8_t c_ = (c); \
-- uint16x4_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-+ uint16x8_t b_ = (b); \
-+ uint16x8_t a_ = (a); \
- uint32x4_t result; \
-- __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \
-+ __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-+ : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmlsl_laneq_u32(a, b, c, d) \
-+#define vmull_high_laneq_u32(a, b, c) \
- __extension__ \
- ({ \
-- uint32x4_t c_ = (c); \
-- uint32x2_t b_ = (b); \
-- uint64x2_t a_ = (a); \
-+ uint32x4_t b_ = (b); \
-+ uint32x4_t a_ = (a); \
- uint64x2_t result; \
-- __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \
-+ __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
- : "=w"(result) \
-- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
-+ : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_n_s16 (int16x8_t a, int16_t b)
- {
- int32x4_t result;
-- __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]"
-+ __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_n_s32 (int32x4_t a, int32_t b)
- {
- int64x2_t result;
-- __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]"
-+ __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_n_u16 (uint16x8_t a, uint16_t b)
- {
- uint32x4_t result;
-- __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]"
-+ __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_n_u32 (uint32x4_t a, uint32_t b)
- {
- uint64x2_t result;
-- __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]"
-+ __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_p8 (poly8x16_t a, poly8x16_t b)
- {
-- int16x8_t result;
-- __asm__ ("smlsl %0.8h, %2.8b, %3.8b"
-+ poly16x8_t result;
-+ __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_s8 (int8x16_t a, int8x16_t b)
- {
-- int32x4_t result;
-- __asm__ ("smlsl %0.4s, %2.4h, %3.4h"
-+ int16x8_t result;
-+ __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_s16 (int16x8_t a, int16x8_t b)
- {
-- int64x2_t result;
-- __asm__ ("smlsl %0.2d, %2.2s, %3.2s"
-+ int32x4_t result;
-+ __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_s32 (int32x4_t a, int32x4_t b)
- {
-- uint16x8_t result;
-- __asm__ ("umlsl %0.8h, %2.8b, %3.8b"
-+ int64x2_t result;
-+ __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_u8 (uint8x16_t a, uint8x16_t b)
- {
-- uint32x4_t result;
-- __asm__ ("umlsl %0.4s, %2.4h, %3.4h"
-+ uint16x8_t result;
-+ __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_u16 (uint16x8_t a, uint16x8_t b)
- {
-- uint64x2_t result;
-- __asm__ ("umlsl %0.2d, %2.2s, %3.2s"
-+ uint32x4_t result;
-+ __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_u32 (uint32x4_t a, uint32x4_t b)
- {
-- float32x4_t result;
-- float32x4_t t1;
-- __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s"
-- : "=w"(result), "=w"(t1)
-- : "0"(a), "w"(b), "w"(c)
-+ uint64x2_t result;
-+ __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c)
-+#define vmull_lane_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int16x4_t b_ = (b); \
-+ int16x4_t a_ = (a); \
-+ int32x4_t result; \
-+ __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "x"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_lane_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x2_t b_ = (b); \
-+ int32x2_t a_ = (a); \
-+ int64x2_t result; \
-+ __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_lane_u16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint16x4_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint32x4_t result; \
-+ __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "x"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_lane_u32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint32x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint64x2_t result; \
-+ __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_laneq_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t b_ = (b); \
-+ int16x4_t a_ = (a); \
-+ int32x4_t result; \
-+ __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "x"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_laneq_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
-+ int32x2_t a_ = (a); \
-+ int64x2_t result; \
-+ __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_laneq_u16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint16x8_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint32x4_t result; \
-+ __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "x"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vmull_laneq_u32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint32x4_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint64x2_t result; \
-+ __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
-+ : "=w"(result) \
-+ : "w"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_n_s16 (int16x4_t a, int16_t b)
- {
-- int16x8_t result;
-- __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
-+ int32x4_t result;
-+ __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_n_s32 (int32x2_t a, int32_t b)
- {
-- int32x4_t result;
-- __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
-+ int64x2_t result;
-+ __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_n_u16 (uint16x4_t a, uint16_t b)
- {
-- uint16x8_t result;
-- __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
-+ uint32x4_t result;
-+ __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "x"(c)
-+ : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_n_u32 (uint32x2_t a, uint32_t b)
- {
-- uint32x4_t result;
-- __asm__ ("mls %0.4s, %2.4s, %3.s[0]"
-+ uint64x2_t result;
-+ __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_p8 (poly8x8_t a, poly8x8_t b)
- {
-- int8x16_t result;
-- __asm__ ("mls %0.16b,%2.16b,%3.16b"
-+ poly16x8_t result;
-+ __asm__ ("pmull %0.8h, %1.8b, %2.8b"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_s8 (int8x8_t a, int8x8_t b)
- {
- int16x8_t result;
-- __asm__ ("mls %0.8h,%2.8h,%3.8h"
-+ __asm__ ("smull %0.8h, %1.8b, %2.8b"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_s16 (int16x4_t a, int16x4_t b)
- {
- int32x4_t result;
-- __asm__ ("mls %0.4s,%2.4s,%3.4s"
-+ __asm__ ("smull %0.4s, %1.4h, %2.4h"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_s32 (int32x2_t a, int32x2_t b)
- {
-- uint8x16_t result;
-- __asm__ ("mls %0.16b,%2.16b,%3.16b"
-+ int64x2_t result;
-+ __asm__ ("smull %0.2d, %1.2s, %2.2s"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_u8 (uint8x8_t a, uint8x8_t b)
- {
- uint16x8_t result;
-- __asm__ ("mls %0.8h,%2.8h,%3.8h"
-+ __asm__ ("umull %0.8h, %1.8b, %2.8b"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_u16 (uint16x4_t a, uint16x4_t b)
- {
- uint32x4_t result;
-- __asm__ ("mls %0.4s,%2.4s,%3.4s"
-+ __asm__ ("umull %0.4s, %1.4h, %2.4h"
- : "=w"(result)
-- : "0"(a), "w"(b), "w"(c)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovl_high_s8 (int8x16_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_u32 (uint32x2_t a, uint32x2_t b)
- {
-- int16x8_t result;
-- __asm__ ("sshll2 %0.8h,%1.16b,#0"
-+ uint64x2_t result;
-+ __asm__ ("umull %0.2d, %1.2s, %2.2s"
- : "=w"(result)
-- : "w"(a)
-+ : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovl_high_s16 (int16x8_t a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadal_s8 (int16x4_t a, int8x8_t b)
- {
-- int32x4_t result;
-- __asm__ ("sshll2 %0.4s,%1.8h,#0"
-+ int16x4_t result;
-+ __asm__ ("sadalp %0.4h,%2.8b"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmovl_high_s32 (int32x4_t a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadal_s16 (int32x2_t a, int16x4_t b)
- {
-- int64x2_t result;
-- __asm__ ("sshll2 %0.2d,%1.4s,#0"
-+ int32x2_t result;
-+ __asm__ ("sadalp %0.2s,%2.4h"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovl_high_u8 (uint8x16_t a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadal_s32 (int64x1_t a, int32x2_t b)
- {
-- uint16x8_t result;
-- __asm__ ("ushll2 %0.8h,%1.16b,#0"
-+ int64x1_t result;
-+ __asm__ ("sadalp %0.1d,%2.2s"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovl_high_u16 (uint16x8_t a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadal_u8 (uint16x4_t a, uint8x8_t b)
- {
-- uint32x4_t result;
-- __asm__ ("ushll2 %0.4s,%1.8h,#0"
-+ uint16x4_t result;
-+ __asm__ ("uadalp %0.4h,%2.8b"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmovl_high_u32 (uint32x4_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadal_u16 (uint32x2_t a, uint16x4_t b)
- {
-- uint64x2_t result;
-- __asm__ ("ushll2 %0.2d,%1.4s,#0"
-+ uint32x2_t result;
-+ __asm__ ("uadalp %0.2s,%2.4h"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovl_s8 (int8x8_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadal_u32 (uint64x1_t a, uint32x2_t b)
-+{
-+ uint64x1_t result;
-+ __asm__ ("uadalp %0.1d,%2.2s"
-+ : "=w"(result)
-+ : "0"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadalq_s8 (int16x8_t a, int8x16_t b)
- {
- int16x8_t result;
-- __asm__ ("sshll %0.8h,%1.8b,#0"
-+ __asm__ ("sadalp %0.8h,%2.16b"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovl_s16 (int16x4_t a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadalq_s16 (int32x4_t a, int16x8_t b)
- {
- int32x4_t result;
-- __asm__ ("sshll %0.4s,%1.4h,#0"
-+ __asm__ ("sadalp %0.4s,%2.8h"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmovl_s32 (int32x2_t a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadalq_s32 (int64x2_t a, int32x4_t b)
- {
- int64x2_t result;
-- __asm__ ("sshll %0.2d,%1.2s,#0"
-+ __asm__ ("sadalp %0.2d,%2.4s"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovl_u8 (uint8x8_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadalq_u8 (uint16x8_t a, uint8x16_t b)
- {
- uint16x8_t result;
-- __asm__ ("ushll %0.8h,%1.8b,#0"
-+ __asm__ ("uadalp %0.8h,%2.16b"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovl_u16 (uint16x4_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadalq_u16 (uint32x4_t a, uint16x8_t b)
- {
- uint32x4_t result;
-- __asm__ ("ushll %0.4s,%1.4h,#0"
-+ __asm__ ("uadalp %0.4s,%2.8h"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmovl_u32 (uint32x2_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadalq_u32 (uint64x2_t a, uint32x4_t b)
- {
- uint64x2_t result;
-- __asm__ ("ushll %0.2d,%1.2s,#0"
-+ __asm__ ("uadalp %0.2d,%2.4s"
- : "=w"(result)
-- : "w"(a)
-+ : "0"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmovn_high_s16 (int8x8_t a, int16x8_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddl_s8 (int8x8_t a)
- {
-- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.16b,%1.8h"
-- : "+w"(result)
-- : "w"(b)
-+ int16x4_t result;
-+ __asm__ ("saddlp %0.4h,%1.8b"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovn_high_s32 (int16x4_t a, int32x4_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddl_s16 (int16x4_t a)
- {
-- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.8h,%1.4s"
-- : "+w"(result)
-- : "w"(b)
-+ int32x2_t result;
-+ __asm__ ("saddlp %0.2s,%1.4h"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovn_high_s64 (int32x2_t a, int64x2_t b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddl_s32 (int32x2_t a)
- {
-- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.4s,%1.2d"
-- : "+w"(result)
-- : "w"(b)
-+ int64x1_t result;
-+ __asm__ ("saddlp %0.1d,%1.2s"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmovn_high_u16 (uint8x8_t a, uint16x8_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddl_u8 (uint8x8_t a)
- {
-- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.16b,%1.8h"
-- : "+w"(result)
-- : "w"(b)
-+ uint16x4_t result;
-+ __asm__ ("uaddlp %0.4h,%1.8b"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovn_high_u32 (uint16x4_t a, uint32x4_t b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddl_u16 (uint16x4_t a)
- {
-- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.8h,%1.4s"
-- : "+w"(result)
-- : "w"(b)
-+ uint32x2_t result;
-+ __asm__ ("uaddlp %0.2s,%1.4h"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovn_high_u64 (uint32x2_t a, uint64x2_t b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddl_u32 (uint32x2_t a)
- {
-- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("xtn2 %0.4s,%1.2d"
-- : "+w"(result)
-- : "w"(b)
-+ uint64x1_t result;
-+ __asm__ ("uaddlp %0.1d,%1.2s"
-+ : "=w"(result)
-+ : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmovn_s16 (int16x8_t a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddlq_s8 (int8x16_t a)
- {
-- int8x8_t result;
-- __asm__ ("xtn %0.8b,%1.8h"
-+ int16x8_t result;
-+ __asm__ ("saddlp %0.8h,%1.16b"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmovn_s32 (int32x4_t a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddlq_s16 (int16x8_t a)
- {
-- int16x4_t result;
-- __asm__ ("xtn %0.4h,%1.4s"
-+ int32x4_t result;
-+ __asm__ ("saddlp %0.4s,%1.8h"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmovn_s64 (int64x2_t a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddlq_s32 (int32x4_t a)
- {
-- int32x2_t result;
-- __asm__ ("xtn %0.2s,%1.2d"
-+ int64x2_t result;
-+ __asm__ ("saddlp %0.2d,%1.4s"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmovn_u16 (uint16x8_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddlq_u8 (uint8x16_t a)
- {
-- uint8x8_t result;
-- __asm__ ("xtn %0.8b,%1.8h"
-+ uint16x8_t result;
-+ __asm__ ("uaddlp %0.8h,%1.16b"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmovn_u32 (uint32x4_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddlq_u16 (uint16x8_t a)
- {
-- uint16x4_t result;
-- __asm__ ("xtn %0.4h,%1.4s"
-+ uint32x4_t result;
-+ __asm__ ("uaddlp %0.4s,%1.8h"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmovn_u64 (uint64x2_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddlq_u32 (uint32x4_t a)
- {
-- uint32x2_t result;
-- __asm__ ("xtn %0.2s,%1.2d"
-+ uint64x2_t result;
-+ __asm__ ("uaddlp %0.2d,%1.4s"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmul_n_f32 (float32x2_t a, float32_t b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_s8 (int8x16_t a, int8x16_t b)
- {
-- float32x2_t result;
-- __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-+ int8x16_t result;
-+ __asm__ ("addp %0.16b,%1.16b,%2.16b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_s16 (int16x8_t a, int16x8_t b)
-+{
-+ int16x8_t result;
-+ __asm__ ("addp %0.8h,%1.8h,%2.8h"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_s32 (int32x4_t a, int32x4_t b)
-+{
-+ int32x4_t result;
-+ __asm__ ("addp %0.4s,%1.4s,%2.4s"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_s64 (int64x2_t a, int64x2_t b)
-+{
-+ int64x2_t result;
-+ __asm__ ("addp %0.2d,%1.2d,%2.2d"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_u8 (uint8x16_t a, uint8x16_t b)
-+{
-+ uint8x16_t result;
-+ __asm__ ("addp %0.16b,%1.16b,%2.16b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_u16 (uint16x8_t a, uint16x8_t b)
-+{
-+ uint16x8_t result;
-+ __asm__ ("addp %0.8h,%1.8h,%2.8h"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_u32 (uint32x4_t a, uint32x4_t b)
-+{
-+ uint32x4_t result;
-+ __asm__ ("addp %0.4s,%1.4s,%2.4s"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_u64 (uint64x2_t a, uint64x2_t b)
-+{
-+ uint64x2_t result;
-+ __asm__ ("addp %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmul_n_s16 (int16x4_t a, int16_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_n_s16 (int16x4_t a, int16_t b)
- {
- int16x4_t result;
-- __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-+ __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]"
- : "=w"(result)
- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmul_n_s32 (int32x2_t a, int32_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_n_s32 (int32x2_t a, int32_t b)
- {
- int32x2_t result;
-- __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-+ __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmul_n_u16 (uint16x4_t a, uint16_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_n_s16 (int16x8_t a, int16_t b)
- {
-- uint16x4_t result;
-- __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-+ int16x8_t result;
-+ __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]"
- : "=w"(result)
- : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmul_n_u32 (uint32x2_t a, uint32_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_n_s32 (int32x4_t a, int32_t b)
- {
-- uint32x2_t result;
-- __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-+ int32x4_t result;
-+ __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--#define vmull_high_lane_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x4_t b_ = (b); \
-- int16x8_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_s16 (int8x8_t a, int16x8_t b)
-+{
-+ int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtn2 %0.16b, %1.8h"
-+ : "+w"(result)
-+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--#define vmull_high_lane_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x2_t b_ = (b); \
-- int32x4_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmull_high_lane_u16(a, b, c) \
-- __extension__ \
-- ({ \
-- uint16x4_t b_ = (b); \
-- uint16x8_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmull_high_lane_u32(a, b, c) \
-- __extension__ \
-- ({ \
-- uint32x2_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmull_high_laneq_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- int16x8_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmull_high_laneq_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- int32x4_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmull_high_laneq_u16(a, b, c) \
-- __extension__ \
-- ({ \
-- uint16x8_t b_ = (b); \
-- uint16x8_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vmull_high_laneq_u32(a, b, c) \
-- __extension__ \
-- ({ \
-- uint32x4_t b_ = (b); \
-- uint32x4_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_s32 (int16x4_t a, int32x4_t b)
-+{
-+ int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtn2 %0.8h, %1.4s"
-+ : "+w"(result)
-+ : "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmull_high_n_s16 (int16x8_t a, int16_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_s64 (int32x2_t a, int64x2_t b)
- {
-- int32x4_t result;
-- __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-+ int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtn2 %0.4s, %1.2d"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmull_high_n_s32 (int32x4_t a, int32_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_u16 (uint8x8_t a, uint16x8_t b)
- {
-- int64x2_t result;
-- __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("uqxtn2 %0.16b, %1.8h"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmull_high_n_u16 (uint16x8_t a, uint16_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_u32 (uint16x4_t a, uint32x4_t b)
- {
-- uint32x4_t result;
-- __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-+ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("uqxtn2 %0.8h, %1.4s"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmull_high_n_u32 (uint32x4_t a, uint32_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_high_u64 (uint32x2_t a, uint64x2_t b)
- {
-- uint64x2_t result;
-- __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("uqxtn2 %0.4s, %1.2d"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vmull_high_p8 (poly8x16_t a, poly8x16_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_high_s16 (uint8x8_t a, int16x8_t b)
- {
-- poly16x8_t result;
-- __asm__ ("pmull2 %0.8h,%1.16b,%2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtun2 %0.16b, %1.8h"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmull_high_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_high_s32 (uint16x4_t a, int32x4_t b)
- {
-- int16x8_t result;
-- __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtun2 %0.8h, %1.4s"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmull_high_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_high_s64 (uint32x2_t a, int64x2_t b)
- {
-- int32x4_t result;
-- __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-+ uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("sqxtun2 %0.4s, %1.2d"
-+ : "+w"(result)
-+ : "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmull_high_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_n_s16 (int16x4_t a, int16_t b)
- {
-- int64x2_t result;
-- __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
-+ int16x4_t result;
-+ __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
- : "=w"(result)
-- : "w"(a), "w"(b)
-+ : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmull_high_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_n_s32 (int32x2_t a, int32_t b)
- {
-- uint16x8_t result;
-- __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
-+ int32x2_t result;
-+ __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmull_high_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
- {
-- uint32x4_t result;
-- __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
-+ int16x8_t result;
-+ __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
- : "=w"(result)
-- : "w"(a), "w"(b)
-+ : "w"(a), "x"(b)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmull_high_u32 (uint32x4_t a, uint32x4_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
- {
-- uint64x2_t result;
-- __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
-+ int32x4_t result;
-+ __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
- }
-
--#define vmull_lane_s16(a, b, c) \
-+#define vqrshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
-- int16x4_t b_ = (b); \
-+ int16x8_t b_ = (b); \
-+ int8x8_t a_ = (a); \
-+ int8x16_t result = vcombine_s8 \
-+ (a_, vcreate_s8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vqrshrn_high_n_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-+ int16x8_t result = vcombine_s16 \
-+ (a_, vcreate_s16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_lane_s32(a, b, c) \
-+#define vqrshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
-- int32x2_t b_ = (b); \
-+ int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-+ int32x4_t result = vcombine_s32 \
-+ (a_, vcreate_s32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_lane_u16(a, b, c) \
-+#define vqrshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
-- uint16x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-+ uint16x8_t b_ = (b); \
-+ uint8x8_t a_ = (a); \
-+ uint8x16_t result = vcombine_u8 \
-+ (a_, vcreate_u8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_lane_u32(a, b, c) \
-+#define vqrshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
-- uint32x2_t b_ = (b); \
-+ uint32x4_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint16x8_t result = vcombine_u16 \
-+ (a_, vcreate_u16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vqrshrn_high_n_u64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-+ uint32x4_t result = vcombine_u32 \
-+ (a_, vcreate_u32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_laneq_s16(a, b, c) \
-+#define vqrshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
-- int16x4_t a_ = (a); \
-- int32x4_t result; \
-- __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-+ uint8x8_t a_ = (a); \
-+ uint8x16_t result = vcombine_u8 \
-+ (a_, vcreate_u8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_laneq_s32(a, b, c) \
-+#define vqrshrun_high_n_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint16x8_t result = vcombine_u16 \
-+ (a_, vcreate_u16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vqrshrun_high_n_s64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint32x4_t result = vcombine_u32 \
-+ (a_, vcreate_u32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vqshrn_high_n_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t b_ = (b); \
-+ int8x8_t a_ = (a); \
-+ int8x16_t result = vcombine_s8 \
-+ (a_, vcreate_s8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vqshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
-+ int16x4_t a_ = (a); \
-+ int16x8_t result = vcombine_s16 \
-+ (a_, vcreate_s16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vqshrn_high_n_s64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
-- int64x2_t result; \
-- __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-+ int32x4_t result = vcombine_s32 \
-+ (a_, vcreate_s32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_laneq_u16(a, b, c) \
-+#define vqshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint32x4_t result; \
-- __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "x"(b_), "i"(c) \
-+ uint8x8_t a_ = (a); \
-+ uint8x16_t result = vcombine_u8 \
-+ (a_, vcreate_u8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--#define vmull_laneq_u32(a, b, c) \
-+#define vqshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint64x2_t result; \
-- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
-- : "=w"(result) \
-- : "w"(a_), "w"(b_), "i"(c) \
-+ uint16x4_t a_ = (a); \
-+ uint16x8_t result = vcombine_u16 \
-+ (a_, vcreate_u16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmull_n_s16 (int16x4_t a, int16_t b)
--{
-- int32x4_t result;
-- __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vqshrn_high_n_u64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint64x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint32x4_t result = vcombine_u32 \
-+ (a_, vcreate_u32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmull_n_s32 (int32x2_t a, int32_t b)
--{
-- int64x2_t result;
-- __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vqshrun_high_n_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t b_ = (b); \
-+ uint8x8_t a_ = (a); \
-+ uint8x16_t result = vcombine_u8 \
-+ (a_, vcreate_u8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmull_n_u16 (uint16x4_t a, uint16_t b)
--{
-- uint32x4_t result;
-- __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vqshrun_high_n_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint16x8_t result = vcombine_u16 \
-+ (a_, vcreate_u16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmull_n_u32 (uint32x2_t a, uint32_t b)
--{
-- uint64x2_t result;
-- __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vqshrun_high_n_s64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint32x4_t result = vcombine_u32 \
-+ (a_, vcreate_u32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vmull_p8 (poly8x8_t a, poly8x8_t b)
--{
-- poly16x8_t result;
-- __asm__ ("pmull %0.8h, %1.8b, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_high_n_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t b_ = (b); \
-+ int8x8_t a_ = (a); \
-+ int8x16_t result = vcombine_s8 \
-+ (a_, vcreate_s8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmull_s8 (int8x8_t a, int8x8_t b)
--{
-- int16x8_t result;
-- __asm__ ("smull %0.8h, %1.8b, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmull_s16 (int16x4_t a, int16x4_t b)
--{
-- int32x4_t result;
-- __asm__ ("smull %0.4s, %1.4h, %2.4h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmull_s32 (int32x2_t a, int32x2_t b)
--{
-- int64x2_t result;
-- __asm__ ("smull %0.2d, %1.2s, %2.2s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
--
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmull_u8 (uint8x8_t a, uint8x8_t b)
--{
-- uint16x8_t result;
-- __asm__ ("umull %0.8h, %1.8b, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_high_n_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
-+ int16x4_t a_ = (a); \
-+ int16x8_t result = vcombine_s16 \
-+ (a_, vcreate_s16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmull_u16 (uint16x4_t a, uint16x4_t b)
--{
-- uint32x4_t result;
-- __asm__ ("umull %0.4s, %1.4h, %2.4h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_high_n_s64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t b_ = (b); \
-+ int32x2_t a_ = (a); \
-+ int32x4_t result = vcombine_s32 \
-+ (a_, vcreate_s32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmull_u32 (uint32x2_t a, uint32x2_t b)
--{
-- uint64x2_t result;
-- __asm__ ("umull %0.2d, %1.2s, %2.2s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_high_n_u16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint16x8_t b_ = (b); \
-+ uint8x8_t a_ = (a); \
-+ uint8x16_t result = vcombine_u8 \
-+ (a_, vcreate_u8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulq_n_f32 (float32x4_t a, float32_t b)
--{
-- float32x4_t result;
-- __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_high_n_u32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint32x4_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint16x8_t result = vcombine_u16 \
-+ (a_, vcreate_u16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulq_n_f64 (float64x2_t a, float64_t b)
--{
-- float64x2_t result;
-- __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_high_n_u64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint64x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint32x4_t result = vcombine_u32 \
-+ (a_, vcreate_u32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmulq_n_s16 (int16x8_t a, int16_t b)
--{
-- int16x8_t result;
-- __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_n_s16(a, b) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t a_ = (a); \
-+ int8x8_t result; \
-+ __asm__ ("rshrn %0.8b,%1.8h,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmulq_n_s32 (int32x4_t a, int32_t b)
--{
-- int32x4_t result;
-- __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_n_s32(a, b) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t a_ = (a); \
-+ int16x4_t result; \
-+ __asm__ ("rshrn %0.4h,%1.4s,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmulq_n_u16 (uint16x8_t a, uint16_t b)
--{
-- uint16x8_t result;
-- __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_n_s64(a, b) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t a_ = (a); \
-+ int32x2_t result; \
-+ __asm__ ("rshrn %0.2s,%1.2d,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmulq_n_u32 (uint32x4_t a, uint32_t b)
--{
-- uint32x4_t result;
-- __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_n_u16(a, b) \
-+ __extension__ \
-+ ({ \
-+ uint16x8_t a_ = (a); \
-+ uint8x8_t result; \
-+ __asm__ ("rshrn %0.8b,%1.8h,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vmvn_p8 (poly8x8_t a)
--{
-- poly8x8_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_n_u32(a, b) \
-+ __extension__ \
-+ ({ \
-+ uint32x4_t a_ = (a); \
-+ uint16x4_t result; \
-+ __asm__ ("rshrn %0.4h,%1.4s,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmvn_s8 (int8x8_t a)
--{
-- int8x8_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
--}
-+#define vrshrn_n_u64(a, b) \
-+ __extension__ \
-+ ({ \
-+ uint64x2_t a_ = (a); \
-+ uint32x2_t result; \
-+ __asm__ ("rshrn %0.2s,%1.2d,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmvn_s16 (int16x4_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrte_u32 (uint32x2_t a)
- {
-- int16x4_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-+ uint32x2_t result;
-+ __asm__ ("ursqrte %0.2s,%1.2s"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmvn_s32 (int32x2_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrteq_u32 (uint32x4_t a)
- {
-- int32x2_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-+ uint32x4_t result;
-+ __asm__ ("ursqrte %0.4s,%1.4s"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmvn_u8 (uint8x8_t a)
--{
-+#define vshrn_high_n_s16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t b_ = (b); \
-+ int8x8_t a_ = (a); \
-+ int8x16_t result = vcombine_s8 \
-+ (a_, vcreate_s8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("shrn2 %0.16b,%1.8h,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_high_n_s32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t b_ = (b); \
-+ int16x4_t a_ = (a); \
-+ int16x8_t result = vcombine_s16 \
-+ (a_, vcreate_s16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("shrn2 %0.8h,%1.4s,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_high_n_s64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t b_ = (b); \
-+ int32x2_t a_ = (a); \
-+ int32x4_t result = vcombine_s32 \
-+ (a_, vcreate_s32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("shrn2 %0.4s,%1.2d,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_high_n_u16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint16x8_t b_ = (b); \
-+ uint8x8_t a_ = (a); \
-+ uint8x16_t result = vcombine_u8 \
-+ (a_, vcreate_u8 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("shrn2 %0.16b,%1.8h,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_high_n_u32(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint32x4_t b_ = (b); \
-+ uint16x4_t a_ = (a); \
-+ uint16x8_t result = vcombine_u16 \
-+ (a_, vcreate_u16 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("shrn2 %0.8h,%1.4s,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_high_n_u64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ uint64x2_t b_ = (b); \
-+ uint32x2_t a_ = (a); \
-+ uint32x4_t result = vcombine_u32 \
-+ (a_, vcreate_u32 \
-+ (__AARCH64_UINT64_C (0x0))); \
-+ __asm__ ("shrn2 %0.4s,%1.2d,#%2" \
-+ : "+w"(result) \
-+ : "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_n_s16(a, b) \
-+ __extension__ \
-+ ({ \
-+ int16x8_t a_ = (a); \
-+ int8x8_t result; \
-+ __asm__ ("shrn %0.8b,%1.8h,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_n_s32(a, b) \
-+ __extension__ \
-+ ({ \
-+ int32x4_t a_ = (a); \
-+ int16x4_t result; \
-+ __asm__ ("shrn %0.4h,%1.4s,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_n_s64(a, b) \
-+ __extension__ \
-+ ({ \
-+ int64x2_t a_ = (a); \
-+ int32x2_t result; \
-+ __asm__ ("shrn %0.2s,%1.2d,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_n_u16(a, b) \
-+ __extension__ \
-+ ({ \
-+ uint16x8_t a_ = (a); \
-+ uint8x8_t result; \
-+ __asm__ ("shrn %0.8b,%1.8h,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_n_u32(a, b) \
-+ __extension__ \
-+ ({ \
-+ uint32x4_t a_ = (a); \
-+ uint16x4_t result; \
-+ __asm__ ("shrn %0.4h,%1.4s,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vshrn_n_u64(a, b) \
-+ __extension__ \
-+ ({ \
-+ uint64x2_t a_ = (a); \
-+ uint32x2_t result; \
-+ __asm__ ("shrn %0.2s,%1.2d,%2" \
-+ : "=w"(result) \
-+ : "w"(a_), "i"(b) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
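Each of these shift-right-narrow macros wraps a single SHRN instruction: every lane is shifted right by the immediate and narrowed to half width. A minimal usage sketch, assuming a standard arm_neon.h toolchain (the helper name is illustrative):

#include <arm_neon.h>

/* Keep only the high byte of each 16-bit lane:
   shift right by 8, then narrow each lane to 8 bits.  */
uint8x8_t high_bytes (uint16x8_t v)
{
  return vshrn_n_u16 (v, 8);
}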
-+
-+#define vsli_n_p8(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly8x8_t b_ = (b); \
-+ poly8x8_t a_ = (a); \
-+ poly8x8_t result; \
-+ __asm__ ("sli %0.8b,%2.8b,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsli_n_p16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly16x4_t b_ = (b); \
-+ poly16x4_t a_ = (a); \
-+ poly16x4_t result; \
-+ __asm__ ("sli %0.4h,%2.4h,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsliq_n_p8(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly8x16_t b_ = (b); \
-+ poly8x16_t a_ = (a); \
-+ poly8x16_t result; \
-+ __asm__ ("sli %0.16b,%2.16b,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsliq_n_p16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly16x8_t b_ = (b); \
-+ poly16x8_t a_ = (a); \
-+ poly16x8_t result; \
-+ __asm__ ("sli %0.8h,%2.8h,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsri_n_p8(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly8x8_t b_ = (b); \
-+ poly8x8_t a_ = (a); \
-+ poly8x8_t result; \
-+ __asm__ ("sri %0.8b,%2.8b,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsri_n_p16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly16x4_t b_ = (b); \
-+ poly16x4_t a_ = (a); \
-+ poly16x4_t result; \
-+ __asm__ ("sri %0.4h,%2.4h,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsri_n_p64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly64x1_t b_ = (b); \
-+ poly64x1_t a_ = (a); \
-+ poly64x1_t result; \
-+ __asm__ ("sri %d0,%d2,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers. */); \
-+ result; \
-+ })
-+
-+#define vsriq_n_p8(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly8x16_t b_ = (b); \
-+ poly8x16_t a_ = (a); \
-+ poly8x16_t result; \
-+ __asm__ ("sri %0.16b,%2.16b,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsriq_n_p16(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly16x8_t b_ = (b); \
-+ poly16x8_t a_ = (a); \
-+ poly16x8_t result; \
-+ __asm__ ("sri %0.8h,%2.8h,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers */); \
-+ result; \
-+ })
-+
-+#define vsriq_n_p64(a, b, c) \
-+ __extension__ \
-+ ({ \
-+ poly64x2_t b_ = (b); \
-+ poly64x2_t a_ = (a); \
-+ poly64x2_t result; \
-+ __asm__ ("sri %0.2d,%2.2d,%3" \
-+ : "=w"(result) \
-+ : "0"(a_), "w"(b_), "i"(c) \
-+ : /* No clobbers. */); \
-+ result; \
-+ })
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_p8 (poly8x8_t a, poly8x8_t b)
-+{
-+ uint8x8_t result;
-+ __asm__ ("cmtst %0.8b, %1.8b, %2.8b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_p16 (poly16x4_t a, poly16x4_t b)
-+{
-+ uint16x4_t result;
-+ __asm__ ("cmtst %0.4h, %1.4h, %2.4h"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_p8 (poly8x16_t a, poly8x16_t b)
-+{
-+ uint8x16_t result;
-+ __asm__ ("cmtst %0.16b, %1.16b, %2.16b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_p16 (poly16x8_t a, poly16x8_t b)
-+{
-+ uint16x8_t result;
-+ __asm__ ("cmtst %0.8h, %1.8h, %2.8h"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+/* End of temporary inline asm implementations. */
-+
-+/* Start of temporary inline asm for vldn, vstn and friends. */
-+
-+/* Create struct element types for duplicating loads.
-+
-+ Create 2 element structures of:
-+
-+ +------+----+----+----+----+
-+ | | 8 | 16 | 32 | 64 |
-+ +------+----+----+----+----+
-+ |int | Y | Y | N | N |
-+ +------+----+----+----+----+
-+ |uint | Y | Y | N | N |
-+ +------+----+----+----+----+
-+ |float | - | Y | N | N |
-+ +------+----+----+----+----+
-+ |poly | Y | Y | - | - |
-+ +------+----+----+----+----+
-+
-+ Create 3 element structures of:
-+
-+ +------+----+----+----+----+
-+ | | 8 | 16 | 32 | 64 |
-+ +------+----+----+----+----+
-+ |int | Y | Y | Y | Y |
-+ +------+----+----+----+----+
-+ |uint | Y | Y | Y | Y |
-+ +------+----+----+----+----+
-+ |float | - | Y | Y | Y |
-+ +------+----+----+----+----+
-+ |poly | Y | Y | - | - |
-+ +------+----+----+----+----+
-+
-+ Create 4 element structures of:
-+
-+ +------+----+----+----+----+
-+ | | 8 | 16 | 32 | 64 |
-+ +------+----+----+----+----+
-+ |int | Y | N | N | Y |
-+ +------+----+----+----+----+
-+ |uint | Y | N | N | Y |
-+ +------+----+----+----+----+
-+ |float | - | N | N | Y |
-+ +------+----+----+----+----+
-+ |poly | Y | N | - | - |
-+ +------+----+----+----+----+
-+
-+   This is required for casting memory references.  */
-+#define __STRUCTN(t, sz, nelem) \
-+ typedef struct t ## sz ## x ## nelem ## _t { \
-+ t ## sz ## _t val[nelem]; \
-+ } t ## sz ## x ## nelem ## _t;
-+
-+/* 2-element structs. */
-+__STRUCTN (int, 8, 2)
-+__STRUCTN (int, 16, 2)
-+__STRUCTN (uint, 8, 2)
-+__STRUCTN (uint, 16, 2)
-+__STRUCTN (float, 16, 2)
-+__STRUCTN (poly, 8, 2)
-+__STRUCTN (poly, 16, 2)
-+/* 3-element structs. */
-+__STRUCTN (int, 8, 3)
-+__STRUCTN (int, 16, 3)
-+__STRUCTN (int, 32, 3)
-+__STRUCTN (int, 64, 3)
-+__STRUCTN (uint, 8, 3)
-+__STRUCTN (uint, 16, 3)
-+__STRUCTN (uint, 32, 3)
-+__STRUCTN (uint, 64, 3)
-+__STRUCTN (float, 16, 3)
-+__STRUCTN (float, 32, 3)
-+__STRUCTN (float, 64, 3)
-+__STRUCTN (poly, 8, 3)
-+__STRUCTN (poly, 16, 3)
-+/* 4-element structs. */
-+__STRUCTN (int, 8, 4)
-+__STRUCTN (int, 64, 4)
-+__STRUCTN (uint, 8, 4)
-+__STRUCTN (uint, 64, 4)
-+__STRUCTN (poly, 8, 4)
-+__STRUCTN (float, 64, 4)
-+#undef __STRUCTN
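To make the token pasting concrete: a single instantiation such as __STRUCTN (int, 8, 2) expands to the typedef below (shown for illustration only).

typedef struct int8x2_t {
  int8_t val[2];
} int8x2_t;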
-+
-+
-+#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \
-+ qmode, ptr_mode, funcsuffix, signedtype) \
-+__extension__ extern __inline void \
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+vst2_lane_ ## funcsuffix (ptrtype *__ptr, \
-+ intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_oi __o; \
-+ largetype __temp; \
-+ __temp.val[0] \
-+ = vcombine_##funcsuffix (__b.val[0], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __temp.val[1] \
-+ = vcombine_##funcsuffix (__b.val[1], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-+ (signedtype) __temp.val[0], 0); \
-+ __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-+ (signedtype) __temp.val[1], 1); \
-+ __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-+ __ptr, __o, __c); \
-+}
-+
-+__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
-+ float16x8_t)
-+__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
-+ float32x4_t)
-+__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
-+ float64x2_t)
-+__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-+ int8x16_t)
-+__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
-+ int16x8_t)
-+__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
-+ poly64x2_t)
-+__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-+ int8x16_t)
-+__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-+ int16x8_t)
-+__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-+ int32x4_t)
-+__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-+ int64x2_t)
-+__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-+ int8x16_t)
-+__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
-+ int16x8_t)
-+__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
-+ int32x4_t)
-+__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
-+ int64x2_t)
-+
-+#undef __ST2_LANE_FUNC
-+#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
-+__extension__ extern __inline void \
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
-+ intype __b, const int __c) \
-+{ \
-+ union { intype __i; \
-+ __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
-+ __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-+ __ptr, __temp.__o, __c); \
-+}
-+
-+__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-+__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-+__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-+__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-+__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-+__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-+__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-+__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-+__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-+__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-+__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-+__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-+__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-+__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
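Each generated vst2q_lane_* function stores one lane from each half of a paired register to consecutive memory locations. A minimal sketch, assuming arm_neon.h is available (the helper name is illustrative):

#include <arm_neon.h>

/* Store lane 3 of both vectors in the pair:
   writes pair.val[0][3] followed by pair.val[1][3].  */
void store_lane3 (uint8_t *dst, uint8x16x2_t pair)
{
  vst2q_lane_u8 (dst, pair, 3);
}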
-+
-+#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \
-+ qmode, ptr_mode, funcsuffix, signedtype) \
-+__extension__ extern __inline void \
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+vst3_lane_ ## funcsuffix (ptrtype *__ptr, \
-+ intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_ci __o; \
-+ largetype __temp; \
-+ __temp.val[0] \
-+ = vcombine_##funcsuffix (__b.val[0], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __temp.val[1] \
-+ = vcombine_##funcsuffix (__b.val[1], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __temp.val[2] \
-+ = vcombine_##funcsuffix (__b.val[2], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __o = __builtin_aarch64_set_qregci##qmode (__o, \
-+ (signedtype) __temp.val[0], 0); \
-+ __o = __builtin_aarch64_set_qregci##qmode (__o, \
-+ (signedtype) __temp.val[1], 1); \
-+ __o = __builtin_aarch64_set_qregci##qmode (__o, \
-+ (signedtype) __temp.val[2], 2); \
-+ __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-+ __ptr, __o, __c); \
-+}
-+
-+__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
-+ float16x8_t)
-+__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32,
-+ float32x4_t)
-+__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64,
-+ float64x2_t)
-+__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
-+ int8x16_t)
-+__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16,
-+ int16x8_t)
-+__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64,
-+ poly64x2_t)
-+__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
-+ int8x16_t)
-+__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
-+ int16x8_t)
-+__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
-+ int32x4_t)
-+__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64,
-+ int64x2_t)
-+__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
-+ int8x16_t)
-+__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16,
-+ int16x8_t)
-+__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32,
-+ int32x4_t)
-+__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
-+ int64x2_t)
-+
-+#undef __ST3_LANE_FUNC
-+#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
-+__extension__ extern __inline void \
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
-+ intype __b, const int __c) \
-+{ \
-+ union { intype __i; \
-+ __builtin_aarch64_simd_ci __o; } __temp = { __b }; \
-+ __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-+ __ptr, __temp.__o, __c); \
-+}
-+
-+__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
-+__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
-+__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
-+__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
-+__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
-+__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
-+__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
-+__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
-+__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
-+__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
-+__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
-+__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
-+__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
-+__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
-+
-+#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \
-+ qmode, ptr_mode, funcsuffix, signedtype) \
-+__extension__ extern __inline void \
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+vst4_lane_ ## funcsuffix (ptrtype *__ptr, \
-+ intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_xi __o; \
-+ largetype __temp; \
-+ __temp.val[0] \
-+ = vcombine_##funcsuffix (__b.val[0], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __temp.val[1] \
-+ = vcombine_##funcsuffix (__b.val[1], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __temp.val[2] \
-+ = vcombine_##funcsuffix (__b.val[2], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __temp.val[3] \
-+ = vcombine_##funcsuffix (__b.val[3], \
-+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[0], 0); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[1], 1); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[2], 2); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[3], 3); \
-+ __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-+ __ptr, __o, __c); \
-+}
-+
-+__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
-+ float16x8_t)
-+__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32,
-+ float32x4_t)
-+__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64,
-+ float64x2_t)
-+__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
-+ int8x16_t)
-+__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16,
-+ int16x8_t)
-+__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64,
-+ poly64x2_t)
-+__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
-+ int8x16_t)
-+__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
-+ int16x8_t)
-+__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
-+ int32x4_t)
-+__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64,
-+ int64x2_t)
-+__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
-+ int8x16_t)
-+__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16,
-+ int16x8_t)
-+__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32,
-+ int32x4_t)
-+__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
-+ int64x2_t)
-+
-+#undef __ST4_LANE_FUNC
-+#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
-+__extension__ extern __inline void \
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
-+ intype __b, const int __c) \
-+{ \
-+ union { intype __i; \
-+ __builtin_aarch64_simd_xi __o; } __temp = { __b }; \
-+ __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-+ __ptr, __temp.__o, __c); \
-+}
-+
-+__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
-+__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
-+__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
-+__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
-+__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
-+__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
-+__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
-+__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
-+__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
-+__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
-+__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
-+__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
-+__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
-+__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddlv_s32 (int32x2_t a)
-+{
-+ int64_t result;
-+ __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
-+ return result;
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddlv_u32 (uint32x2_t a)
-+{
-+ uint64_t result;
-+ __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
-+ return result;
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c);
-+}
-+
-+/* Table intrinsics. */
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl1_p8 (poly8x16_t a, uint8x8_t b)
-+{
-+ poly8x8_t result;
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl1_s8 (int8x16_t a, uint8x8_t b)
-+{
-+ int8x8_t result;
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
-+{
-+ uint8x8_t result;
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl1q_p8 (poly8x16_t a, uint8x16_t b)
-+{
-+ poly8x16_t result;
-+ __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl1q_s8 (int8x16_t a, uint8x16_t b)
-+{
-+ int8x16_t result;
-+ __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl1q_u8 (uint8x16_t a, uint8x16_t b)
-+{
-+ uint8x16_t result;
-+ __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-+ : "=w"(result)
-+ : "w"(a), "w"(b)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx)
-+{
-+ int8x8_t result = r;
-+ __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
-+ : "+w"(result)
-+ : "w"(tab), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx)
-+{
-+ uint8x8_t result = r;
-+ __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
-+ : "+w"(result)
-+ : "w"(tab), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx)
-+{
-+ poly8x8_t result = r;
-+ __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
-+ : "+w"(result)
-+ : "w"(tab), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx)
-+{
-+ int8x16_t result = r;
-+ __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-+ : "+w"(result)
-+ : "w"(tab), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx)
-+{
-+ uint8x16_t result = r;
-+ __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-+ : "+w"(result)
-+ : "w"(tab), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx)
-+{
-+ poly8x16_t result = r;
-+ __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-+ : "+w"(result)
-+ : "w"(tab), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
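The quad-register table lookups above map each index byte to a table byte (for TBL, out-of-range indices produce zero). A full byte reversal, for example, is a single lookup; a minimal sketch (the helper name is illustrative):

#include <arm_neon.h>

/* Reverse the 16 bytes of v with one TBL: result[i] = v[idx[i]].  */
uint8x16_t reverse_bytes (uint8x16_t v)
{
  const uint8x16_t idx = { 15, 14, 13, 12, 11, 10, 9, 8,
                           7, 6, 5, 4, 3, 2, 1, 0 };
  return vqtbl1q_u8 (v, idx);
}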
-+
-+/* V7 legacy table intrinsics. */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl1_s8 (int8x8_t tab, int8x8_t idx)
-+{
-+ int8x8_t result;
-+ int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
-+{
- uint8x8_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-+ uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
- : "=w"(result)
-- : "w"(a)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl1_p8 (poly8x8_t tab, uint8x8_t idx)
-+{
-+ poly8x8_t result;
-+ poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl2_s8 (int8x8x2_t tab, int8x8_t idx)
-+{
-+ int8x8_t result;
-+ int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx)
-+{
-+ uint8x8_t result;
-+ uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx)
-+{
-+ poly8x8_t result;
-+ poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-+ __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-+ : "=w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
-+{
-+ int8x8_t result;
-+ int8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-+ temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = __builtin_aarch64_tbl3v8qi (__o, idx);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
-+{
-+ uint8x8_t result;
-+ uint8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-+ temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
-+{
-+ poly8x8_t result;
-+ poly8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-+ temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
-+{
-+ int8x8_t result;
-+ int8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-+ temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = __builtin_aarch64_tbl3v8qi (__o, idx);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
-+{
-+ uint8x8_t result;
-+ uint8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-+ temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
-+{
-+ poly8x8_t result;
-+ poly8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-+ temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return result;
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx)
-+{
-+ int8x8_t result = r;
-+ int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-+ __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-+ : "+w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx)
-+{
-+ uint8x8_t result = r;
-+ uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-+ __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-+ : "+w"(result)
-+ : "w"(temp), "w"(idx)
-+ : /* No clobbers */);
-+ return result;
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
-+{
-+ poly8x8_t result = r;
-+ poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-+ __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-+ : "+w"(result)
-+ : "w"(temp), "w"(idx)
- : /* No clobbers */);
- return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmvn_u16 (uint16x4_t a)
-+/* End of temporary inline asm. */
-+
-+/* Start of optimal implementations in approved order. */
-+
-+/* vabd. */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabds_f32 (float32_t __a, float32_t __b)
-+{
-+ return __builtin_aarch64_fabdsf (__a, __b);
-+}
-+
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabdd_f64 (float64_t __a, float64_t __b)
-+{
-+ return __builtin_aarch64_fabddf (__a, __b);
-+}
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabd_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return __builtin_aarch64_fabdv2sf (__a, __b);
-+}
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabd_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return (float64x1_t) {vabdd_f64 (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0))};
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabdq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return __builtin_aarch64_fabdv4sf (__a, __b);
-+}
-+
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabdq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return __builtin_aarch64_fabdv2df (__a, __b);
-+}
-+
-+/* vabs */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_f32 (float32x2_t __a)
-+{
-+ return __builtin_aarch64_absv2sf (__a);
-+}
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_f64 (float64x1_t __a)
-+{
-+ return (float64x1_t) {__builtin_fabs (__a[0])};
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_s8 (int8x8_t __a)
-+{
-+ return __builtin_aarch64_absv8qi (__a);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_s16 (int16x4_t __a)
-+{
-+ return __builtin_aarch64_absv4hi (__a);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_s32 (int32x2_t __a)
-+{
-+ return __builtin_aarch64_absv2si (__a);
-+}
-+
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_s64 (int64x1_t __a)
-+{
-+ return (int64x1_t) {__builtin_aarch64_absdi (__a[0])};
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_f32 (float32x4_t __a)
-+{
-+ return __builtin_aarch64_absv4sf (__a);
-+}
-+
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_f64 (float64x2_t __a)
-+{
-+ return __builtin_aarch64_absv2df (__a);
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_s8 (int8x16_t __a)
-+{
-+ return __builtin_aarch64_absv16qi (__a);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_s16 (int16x8_t __a)
-+{
-+ return __builtin_aarch64_absv8hi (__a);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_s32 (int32x4_t __a)
-+{
-+ return __builtin_aarch64_absv4si (__a);
-+}
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_s64 (int64x2_t __a)
-+{
-+ return __builtin_aarch64_absv2di (__a);
-+}
-+
-+/* vadd */
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddd_s64 (int64_t __a, int64_t __b)
-+{
-+ return __a + __b;
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddd_u64 (uint64_t __a, uint64_t __b)
-+{
-+ return __a + __b;
-+}
-+
-+/* vaddv */
-+
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_s8 (int8x8_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v8qi (__a);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_s16 (int16x4_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v4hi (__a);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_s32 (int32x2_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v2si (__a);
-+}
-+
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_u8 (uint8x8_t __a)
-+{
-+ return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a);
-+}
-+
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_u16 (uint16x4_t __a)
-+{
-+ return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a);
-+}
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_u32 (uint32x2_t __a)
-+{
-+ return (int32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a);
-+}
-+
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_s8 (int8x16_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v16qi (__a);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_s16 (int16x8_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v8hi (__a);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_s32 (int32x4_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v4si (__a);
-+}
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_s64 (int64x2_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v2di (__a);
-+}
-+
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_u8 (uint8x16_t __a)
-+{
-+ return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a);
-+}
-+
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_u16 (uint16x8_t __a)
-+{
-+ return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a);
-+}
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_u32 (uint32x4_t __a)
-+{
-+ return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a);
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_u64 (uint64x2_t __a)
-+{
-+ return (uint64_t) __builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a);
-+}
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddv_f32 (float32x2_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
-+}
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_f32 (float32x4_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v4sf (__a);
-+}
-+
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddvq_f64 (float64x2_t __a)
-+{
-+ return __builtin_aarch64_reduc_plus_scal_v2df (__a);
-+}
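The vaddv family performs an across-lanes add, reducing a whole vector to one scalar. A minimal sketch, assuming arm_neon.h (the helper name is illustrative):

#include <arm_neon.h>

/* Sum of all eight bytes; e.g. {1,2,3,4,5,6,7,8} yields 36.
   The result wraps modulo 256, matching the uint8_t return type.  */
uint8_t sum8 (uint8x8_t v)
{
  return vaddv_u8 (v);
}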
-+
-+/* vbsl */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4hf_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2sf_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_f64 (uint64x1_t __a, float64x1_t __b, float64x1_t __c)
-+{
-+ return (float64x1_t)
-+ { __builtin_aarch64_simd_bsldf_suss (__a[0], __b[0], __c[0]) };
-+}
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8qi_pupp (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4hi_pupp (__a, __b, __c);
-+}
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_p64 (uint64x1_t __a, poly64x1_t __b, poly64x1_t __c)
-+{
-+ return (poly64x1_t)
-+ {__builtin_aarch64_simd_bsldi_pupp (__a[0], __b[0], __c[0])};
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8qi_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4hi_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2si_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
-+{
-+ return (int64x1_t)
-+ {__builtin_aarch64_simd_bsldi_suss (__a[0], __b[0], __c[0])};
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8qi_uuuu (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4hi_uuuu (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2si_uuuu (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
-+{
-+ return (uint64x1_t)
-+ {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])};
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8hf_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4sf_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_f64 (uint64x2_t __a, float64x2_t __b, float64x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2df_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv16qi_pupp (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8hi_pupp (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv16qi_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8hi_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_p64 (uint64x2_t __a, poly64x2_t __b, poly64x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2di_pupp (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4si_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2di_suss (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv16qi_uuuu (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv8hi_uuuu (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv4si_uuuu (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
-+{
-+ return __builtin_aarch64_simd_bslv2di_uuuu (__a, __b, __c);
-+}
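vbsl is a bitwise select: each result bit is taken from __b where the corresponding mask bit is 1 and from __c where it is 0, which makes it the usual branchless per-lane select. A minimal sketch (illustrative; vmax_u8 would do this directly, the point here is the select pattern):

#include <arm_neon.h>

/* Per-lane max without branches: compare lanes are all-ones
   where a > b, so vbsl picks a there and b elsewhere.  */
uint8x8_t max_u8 (uint8x8_t a, uint8x8_t b)
{
  uint8x8_t mask = vcgt_u8 (a, b);   /* 0xff where a > b, else 0x00 */
  return vbsl_u8 (mask, a, b);
}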
-+
-+/* ARMv8.1-A intrinsics.  */
-+#pragma GCC push_options
-+#pragma GCC target ("arch=armv8.1-a")
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlahv2si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlahv4si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlshv4hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlshv2si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-+{
-+ return __builtin_aarch64_sqrdmlshv4si (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlah_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_laneqv4hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlah_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_laneqv2si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_laneqv8hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_laneqv4si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlsh_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_laneqv4hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlsh_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_laneqv2si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_laneqv8hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_laneqv4si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_lanev4hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_lanev2si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_lanev8hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_lanev4si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahh_s16 (int16_t __a, int16_t __b, int16_t __c)
-+{
-+ return (int16_t) __builtin_aarch64_sqrdmlahhi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_lanehi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_laneqhi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahs_s32 (int32_t __a, int32_t __b, int32_t __c)
-+{
-+ return (int32_t) __builtin_aarch64_sqrdmlahsi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_lanesi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlahs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlah_laneqsi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_lanev4hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_lanev2si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_lanev8hi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_lanev4si (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshh_s16 (int16_t __a, int16_t __b, int16_t __c)
-+{
-+ return (int16_t) __builtin_aarch64_sqrdmlshhi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_lanehi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_laneqhi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshs_s32 (int32_t __a, int32_t __b, int32_t __c)
-+{
-+ return (int32_t) __builtin_aarch64_sqrdmlshsi (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_lanesi (__a, __b, __c, __d);
-+}
-+
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmlshs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d)
-+{
-+ return __builtin_aarch64_sqrdmlsh_laneqsi (__a, __b, __c, __d);
-+}
-+#pragma GCC pop_options
-+
-+#pragma GCC push_options
-+#pragma GCC target ("+nothing+crypto")
-+/* vaes */
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaeseq_u8 (uint8x16_t data, uint8x16_t key)
- {
-- uint16x4_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_aarch64_crypto_aesev16qi_uuu (data, key);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmvn_u32 (uint32x2_t a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaesdq_u8 (uint8x16_t data, uint8x16_t key)
- {
-- uint32x2_t result;
-- __asm__ ("mvn %0.8b,%1.8b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_aarch64_crypto_aesdv16qi_uuu (data, key);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vmvnq_p8 (poly8x16_t a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaesmcq_u8 (uint8x16_t data)
- {
-- poly8x16_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_aarch64_crypto_aesmcv16qi_uu (data);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmvnq_s8 (int8x16_t a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaesimcq_u8 (uint8x16_t data)
- {
-- int8x16_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_aarch64_crypto_aesimcv16qi_uu (data);
- }
-+#pragma GCC pop_options
-+
-+/* vcage */
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmvnq_s16 (int16x8_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcage_f64 (float64x1_t __a, float64x1_t __b)
- {
-- int16x8_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return vabs_f64 (__a) >= vabs_f64 (__b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmvnq_s32 (int32x4_t a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcages_f32 (float32_t __a, float32_t __b)
- {
-- int32x4_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmvnq_u8 (uint8x16_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcage_f32 (float32x2_t __a, float32x2_t __b)
- {
-- uint8x16_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return vabs_f32 (__a) >= vabs_f32 (__b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmvnq_u16 (uint16x8_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcageq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- uint16x8_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return vabsq_f32 (__a) >= vabsq_f32 (__b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmvnq_u32 (uint32x4_t a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaged_f64 (float64_t __a, float64_t __b)
- {
-- uint32x4_t result;
-- __asm__ ("mvn %0.16b,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcageq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return vabsq_f64 (__a) >= vabsq_f64 (__b);
- }
-
-+/* vcagt */
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vpadal_s8 (int16x4_t a, int8x8_t b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagts_f32 (float32_t __a, float32_t __b)
- {
-- int16x4_t result;
-- __asm__ ("sadalp %0.4h,%2.8b"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagt_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return vabs_f32 (__a) > vabs_f32 (__b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagt_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return vabs_f64 (__a) > vabs_f64 (__b);
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagtq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return vabsq_f32 (__a) > vabsq_f32 (__b);
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagtd_f64 (float64_t __a, float64_t __b)
-+{
-+ return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagtq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return vabsq_f64 (__a) > vabsq_f64 (__b);
-+}
-+
-+/* vcale */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcale_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return vabs_f32 (__a) <= vabs_f32 (__b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcale_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return vabs_f64 (__a) <= vabs_f64 (__b);
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaled_f64 (float64_t __a, float64_t __b)
-+{
-+ return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcales_f32 (float32_t __a, float32_t __b)
-+{
-+ return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaleq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return vabsq_f32 (__a) <= vabsq_f32 (__b);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaleq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return vabsq_f64 (__a) <= vabsq_f64 (__b);
-+}
-+
-+/* vcalt */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalt_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return vabs_f32 (__a) < vabs_f32 (__b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalt_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return vabs_f64 (__a) < vabs_f64 (__b);
-+}
-+
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltd_f64 (float64_t __a, float64_t __b)
-+{
-+ return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return vabsq_f32 (__a) < vabsq_f32 (__b);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return vabsq_f64 (__a) < vabsq_f64 (__b);
-+}
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalts_f32 (float32_t __a, float32_t __b)
-+{
-+ return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
-+}
-+
-+/* vceq - vector. */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return (uint32x2_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return (uint64x1_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_p8 (poly8x8_t __a, poly8x8_t __b)
-+{
-+ return (uint8x8_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_p64 (poly64x1_t __a, poly64x1_t __b)
-+{
-+ return (uint64x1_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s8 (int8x8_t __a, int8x8_t __b)
-+{
-+ return (uint8x8_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s16 (int16x4_t __a, int16x4_t __b)
-+{
-+ return (uint16x4_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s32 (int32x2_t __a, int32x2_t __b)
-+{
-+ return (uint32x2_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_s64 (int64x1_t __a, int64x1_t __b)
-+{
-+ return (uint64x1_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u8 (uint8x8_t __a, uint8x8_t __b)
-+{
-+ return (__a == __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u16 (uint16x4_t __a, uint16x4_t __b)
-+{
-+ return (__a == __b);
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u32 (uint32x2_t __a, uint32x2_t __b)
-+{
-+ return (__a == __b);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_u64 (uint64x1_t __a, uint64x1_t __b)
-+{
-+ return (__a == __b);
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return (uint32x4_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return (uint64x2_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
-+{
-+ return (uint8x16_t) (__a == __b);
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s8 (int8x16_t __a, int8x16_t __b)
-+{
-+ return (uint8x16_t) (__a == __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vpadal_s16 (int32x2_t a, int16x4_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- int32x2_t result;
-- __asm__ ("sadalp %0.2s,%2.4h"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x8_t) (__a == __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vpadal_s32 (int64x1_t a, int32x2_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- int64x1_t result;
-- __asm__ ("sadalp %0.1d,%2.2s"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a == __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vpadal_u8 (uint16x4_t a, uint8x8_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- uint16x4_t result;
-- __asm__ ("uadalp %0.4h,%2.8b"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a == __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vpadal_u16 (uint32x2_t a, uint16x4_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- uint32x2_t result;
-- __asm__ ("uadalp %0.2s,%2.4h"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vpadal_u32 (uint64x1_t a, uint32x2_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- uint64x1_t result;
-- __asm__ ("uadalp %0.1d,%2.2s"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vpadalq_s8 (int16x8_t a, int8x16_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- int16x8_t result;
-- __asm__ ("sadalp %0.8h,%2.16b"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vpadalq_s16 (int32x4_t a, int16x8_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
-- int32x4_t result;
-- __asm__ ("sadalp %0.4s,%2.8h"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vpadalq_s32 (int64x2_t a, int32x4_t b)
-+/* vceq - scalar. */
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqs_f32 (float32_t __a, float32_t __b)
- {
-- int64x2_t result;
-- __asm__ ("sadalp %0.2d,%2.4s"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == __b ? -1 : 0;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vpadalq_u8 (uint16x8_t a, uint8x16_t b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqd_s64 (int64_t __a, int64_t __b)
- {
-- uint16x8_t result;
-- __asm__ ("uadalp %0.8h,%2.16b"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vpadalq_u16 (uint32x4_t a, uint16x8_t b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqd_u64 (uint64_t __a, uint64_t __b)
- {
-- uint32x4_t result;
-- __asm__ ("uadalp %0.4s,%2.8h"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vpadalq_u32 (uint64x2_t a, uint32x4_t b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqd_f64 (float64_t __a, float64_t __b)
- {
-- uint64x2_t result;
-- __asm__ ("uadalp %0.2d,%2.4s"
-- : "=w"(result)
-- : "0"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vpadd_f32 (float32x2_t a, float32x2_t b)
-+/* vceqz - vector. */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_f32 (float32x2_t __a)
- {
-- float32x2_t result;
-- __asm__ ("faddp %0.2s,%1.2s,%2.2s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x2_t) (__a == 0.0f);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vpaddl_s8 (int8x8_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_f64 (float64x1_t __a)
- {
-- int16x4_t result;
-- __asm__ ("saddlp %0.4h,%1.8b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x1_t) (__a == (float64x1_t) {0.0});
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vpaddl_s16 (int16x4_t a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_p8 (poly8x8_t __a)
- {
-- int32x2_t result;
-- __asm__ ("saddlp %0.2s,%1.4h"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x8_t) (__a == 0);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vpaddl_s32 (int32x2_t a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s8 (int8x8_t __a)
- {
-- int64x1_t result;
-- __asm__ ("saddlp %0.1d,%1.2s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x8_t) (__a == 0);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vpaddl_u8 (uint8x8_t a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s16 (int16x4_t __a)
- {
-- uint16x4_t result;
-- __asm__ ("uaddlp %0.4h,%1.8b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x4_t) (__a == 0);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vpaddl_u16 (uint16x4_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s32 (int32x2_t __a)
- {
-- uint32x2_t result;
-- __asm__ ("uaddlp %0.2s,%1.4h"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x2_t) (__a == 0);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vpaddl_u32 (uint32x2_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_s64 (int64x1_t __a)
- {
-- uint64x1_t result;
-- __asm__ ("uaddlp %0.1d,%1.2s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vpaddlq_s8 (int8x16_t a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u8 (uint8x8_t __a)
- {
-- int16x8_t result;
-- __asm__ ("saddlp %0.8h,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a == 0);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vpaddlq_s16 (int16x8_t a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u16 (uint16x4_t __a)
- {
-- int32x4_t result;
-- __asm__ ("saddlp %0.4s,%1.8h"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a == 0);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vpaddlq_s32 (int32x4_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u32 (uint32x2_t __a)
- {
-- int64x2_t result;
-- __asm__ ("saddlp %0.2d,%1.4s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a == 0);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vpaddlq_u8 (uint8x16_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_u64 (uint64x1_t __a)
- {
-- uint16x8_t result;
-- __asm__ ("uaddlp %0.8h,%1.16b"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a == __AARCH64_UINT64_C (0));
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vpaddlq_u16 (uint16x8_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_f32 (float32x4_t __a)
- {
-- uint32x4_t result;
-- __asm__ ("uaddlp %0.4s,%1.8h"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a == 0.0f);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vpaddlq_u32 (uint32x4_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_f64 (float64x2_t __a)
- {
-- uint64x2_t result;
-- __asm__ ("uaddlp %0.2d,%1.4s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a == 0.0f);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vpaddq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_p8 (poly8x16_t __a)
- {
-- float32x4_t result;
-- __asm__ ("faddp %0.4s,%1.4s,%2.4s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x16_t) (__a == 0);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vpaddq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s8 (int8x16_t __a)
- {
-- float64x2_t result;
-- __asm__ ("faddp %0.2d,%1.2d,%2.2d"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x16_t) (__a == 0);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vpaddq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s16 (int16x8_t __a)
- {
-- int8x16_t result;
-- __asm__ ("addp %0.16b,%1.16b,%2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x8_t) (__a == 0);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vpaddq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s32 (int32x4_t __a)
- {
-- int16x8_t result;
-- __asm__ ("addp %0.8h,%1.8h,%2.8h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a == 0);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_s64 (int64x2_t __a)
-+{
-+ return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u8 (uint8x16_t __a)
-+{
-+ return (__a == 0);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vpaddq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u16 (uint16x8_t __a)
- {
-- int32x4_t result;
-- __asm__ ("addp %0.4s,%1.4s,%2.4s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == 0);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vpaddq_s64 (int64x2_t a, int64x2_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u32 (uint32x4_t __a)
- {
-- int64x2_t result;
-- __asm__ ("addp %0.2d,%1.2d,%2.2d"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == 0);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vpaddq_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_u64 (uint64x2_t __a)
- {
-- uint8x16_t result;
-- __asm__ ("addp %0.16b,%1.16b,%2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a == __AARCH64_UINT64_C (0));
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vpaddq_u16 (uint16x8_t a, uint16x8_t b)
-+/* vceqz - scalar. */
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzs_f32 (float32_t __a)
- {
-- uint16x8_t result;
-- __asm__ ("addp %0.8h,%1.8h,%2.8h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == 0.0f ? -1 : 0;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vpaddq_u32 (uint32x4_t a, uint32x4_t b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzd_s64 (int64_t __a)
- {
-- uint32x4_t result;
-- __asm__ ("addp %0.4s,%1.4s,%2.4s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == 0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vpaddq_u64 (uint64x2_t a, uint64x2_t b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzd_u64 (uint64_t __a)
- {
-- uint64x2_t result;
-- __asm__ ("addp %0.2d,%1.2d,%2.2d"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return __a == 0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vpadds_f32 (float32x2_t a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzd_f64 (float64_t __a)
- {
-- float32_t result;
-- __asm__ ("faddp %s0,%1.2s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return __a == 0.0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqdmulh_n_s16 (int16x4_t a, int16_t b)
-+/* vcge - vector. */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_f32 (float32x2_t __a, float32x2_t __b)
- {
-- int16x4_t result;
-- __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x2_t) (__a >= __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqdmulh_n_s32 (int32x2_t a, int32_t b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_f64 (float64x1_t __a, float64x1_t __b)
- {
-- int32x2_t result;
-- __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x1_t) (__a >= __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqdmulhq_n_s16 (int16x8_t a, int16_t b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s8 (int8x8_t __a, int8x8_t __b)
- {
-- int16x8_t result;
-- __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x8_t) (__a >= __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmulhq_n_s32 (int32x4_t a, int32_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s16 (int16x4_t __a, int16x4_t __b)
- {
-- int32x4_t result;
-- __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x4_t) (__a >= __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqmovn_high_s16 (int8x8_t a, int16x8_t b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s32 (int32x2_t __a, int32x2_t __b)
- {
-- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("sqxtn2 %0.16b, %1.8h"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x2_t) (__a >= __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqmovn_high_s32 (int16x4_t a, int32x4_t b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_s64 (int64x1_t __a, int64x1_t __b)
- {
-- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("sqxtn2 %0.8h, %1.4s"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x1_t) (__a >= __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqmovn_high_s64 (int32x2_t a, int64x2_t b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("sqxtn2 %0.4s, %1.2d"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a >= __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqmovn_high_u16 (uint8x8_t a, uint16x8_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("uqxtn2 %0.16b, %1.8h"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a >= __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqmovn_high_u32 (uint16x4_t a, uint32x4_t b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("uqxtn2 %0.8h, %1.4s"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a >= __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqmovn_high_u64 (uint32x2_t a, uint64x2_t b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_u64 (uint64x1_t __a, uint64x1_t __b)
- {
-- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("uqxtn2 %0.4s, %1.2d"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a >= __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqmovun_high_s16 (uint8x8_t a, int16x8_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("sqxtun2 %0.16b, %1.8h"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a >= __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqmovun_high_s32 (uint16x4_t a, int32x4_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("sqxtun2 %0.8h, %1.4s"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a >= __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqmovun_high_s64 (uint32x2_t a, int64x2_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("sqxtun2 %0.4s, %1.2d"
-- : "+w"(result)
-- : "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x16_t) (__a >= __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmulh_n_s16 (int16x4_t a, int16_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- int16x4_t result;
-- __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x8_t) (__a >= __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmulh_n_s32 (int32x2_t a, int32_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- int32x2_t result;
-- __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a >= __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmulhq_n_s16 (int16x8_t a, int16_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- int16x8_t result;
-- __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
-- : "=w"(result)
-- : "w"(a), "x"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a >= __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- int32x4_t result;
-- __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a >= __b);
- }
-
--#define vqrshrn_high_n_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- int8x8_t a_ = (a); \
-- int8x16_t result = vcombine_s8 \
-- (a_, vcreate_s8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
-+{
-+ return (__a >= __b);
-+}
-
--#define vqrshrn_high_n_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- int16x4_t a_ = (a); \
-- int16x8_t result = vcombine_s16 \
-- (a_, vcreate_s16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
-+{
-+ return (__a >= __b);
-+}
-
--#define vqrshrn_high_n_s64(a, b, c) \
-- __extension__ \
-- ({ \
-- int64x2_t b_ = (b); \
-- int32x2_t a_ = (a); \
-- int32x4_t result = vcombine_s32 \
-- (a_, vcreate_s32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
-+{
-+ return (__a >= __b);
-+}
-
--#define vqrshrn_high_n_u16(a, b, c) \
-- __extension__ \
-- ({ \
-- uint16x8_t b_ = (b); \
-- uint8x8_t a_ = (a); \
-- uint8x16_t result = vcombine_u8 \
-- (a_, vcreate_u8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+/* vcge - scalar. */
-
--#define vqrshrn_high_n_u32(a, b, c) \
-- __extension__ \
-- ({ \
-- uint32x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint16x8_t result = vcombine_u16 \
-- (a_, vcreate_u16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcges_f32 (float32_t __a, float32_t __b)
-+{
-+ return __a >= __b ? -1 : 0;
-+}
-
--#define vqrshrn_high_n_u64(a, b, c) \
-- __extension__ \
-- ({ \
-- uint64x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint32x4_t result = vcombine_u32 \
-- (a_, vcreate_u32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcged_s64 (int64_t __a, int64_t __b)
-+{
-+ return __a >= __b ? -1ll : 0ll;
-+}
-
--#define vqrshrun_high_n_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- uint8x8_t a_ = (a); \
-- uint8x16_t result = vcombine_u8 \
-- (a_, vcreate_u8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcged_u64 (uint64_t __a, uint64_t __b)
-+{
-+ return __a >= __b ? -1ll : 0ll;
-+}
-
--#define vqrshrun_high_n_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint16x8_t result = vcombine_u16 \
-- (a_, vcreate_u16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcged_f64 (float64_t __a, float64_t __b)
-+{
-+ return __a >= __b ? -1ll : 0ll;
-+}
-
--#define vqrshrun_high_n_s64(a, b, c) \
-- __extension__ \
-- ({ \
-- int64x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint32x4_t result = vcombine_u32 \
-- (a_, vcreate_u32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+/* vcgez - vector. */
-
--#define vqshrn_high_n_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- int8x8_t a_ = (a); \
-- int8x16_t result = vcombine_s8 \
-- (a_, vcreate_s8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_f32 (float32x2_t __a)
-+{
-+ return (uint32x2_t) (__a >= 0.0f);
-+}
-
--#define vqshrn_high_n_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- int16x4_t a_ = (a); \
-- int16x8_t result = vcombine_s16 \
-- (a_, vcreate_s16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_f64 (float64x1_t __a)
-+{
-+ return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0});
-+}
-
--#define vqshrn_high_n_s64(a, b, c) \
-- __extension__ \
-- ({ \
-- int64x2_t b_ = (b); \
-- int32x2_t a_ = (a); \
-- int32x4_t result = vcombine_s32 \
-- (a_, vcreate_s32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s8 (int8x8_t __a)
-+{
-+ return (uint8x8_t) (__a >= 0);
-+}
-
--#define vqshrn_high_n_u16(a, b, c) \
-- __extension__ \
-- ({ \
-- uint16x8_t b_ = (b); \
-- uint8x8_t a_ = (a); \
-- uint8x16_t result = vcombine_u8 \
-- (a_, vcreate_u8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s16 (int16x4_t __a)
-+{
-+ return (uint16x4_t) (__a >= 0);
-+}
-
--#define vqshrn_high_n_u32(a, b, c) \
-- __extension__ \
-- ({ \
-- uint32x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint16x8_t result = vcombine_u16 \
-- (a_, vcreate_u16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s32 (int32x2_t __a)
-+{
-+ return (uint32x2_t) (__a >= 0);
-+}
-
--#define vqshrn_high_n_u64(a, b, c) \
-- __extension__ \
-- ({ \
-- uint64x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint32x4_t result = vcombine_u32 \
-- (a_, vcreate_u32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_s64 (int64x1_t __a)
-+{
-+ return (uint64x1_t) (__a >= __AARCH64_INT64_C (0));
-+}
-
--#define vqshrun_high_n_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- uint8x8_t a_ = (a); \
-- uint8x16_t result = vcombine_u8 \
-- (a_, vcreate_u8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_f32 (float32x4_t __a)
-+{
-+ return (uint32x4_t) (__a >= 0.0f);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_f64 (float64x2_t __a)
-+{
-+ return (uint64x2_t) (__a >= 0.0);
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s8 (int8x16_t __a)
-+{
-+ return (uint8x16_t) (__a >= 0);
-+}
-
--#define vqshrun_high_n_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint16x8_t result = vcombine_u16 \
-- (a_, vcreate_u16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s16 (int16x8_t __a)
-+{
-+ return (uint16x8_t) (__a >= 0);
-+}
-
--#define vqshrun_high_n_s64(a, b, c) \
-- __extension__ \
-- ({ \
-- int64x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint32x4_t result = vcombine_u32 \
-- (a_, vcreate_u32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s32 (int32x4_t __a)
-+{
-+ return (uint32x4_t) (__a >= 0);
-+}
-
--#define vrshrn_high_n_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- int8x8_t a_ = (a); \
-- int8x16_t result = vcombine_s8 \
-- (a_, vcreate_s8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_s64 (int64x2_t __a)
-+{
-+ return (uint64x2_t) (__a >= __AARCH64_INT64_C (0));
-+}
-
--#define vrshrn_high_n_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- int16x4_t a_ = (a); \
-- int16x8_t result = vcombine_s16 \
-- (a_, vcreate_s16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+/* vcgez - scalar. */
-
--#define vrshrn_high_n_s64(a, b, c) \
-- __extension__ \
-- ({ \
-- int64x2_t b_ = (b); \
-- int32x2_t a_ = (a); \
-- int32x4_t result = vcombine_s32 \
-- (a_, vcreate_s32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezs_f32 (float32_t __a)
-+{
-+ return __a >= 0.0f ? -1 : 0;
-+}
-
--#define vrshrn_high_n_u16(a, b, c) \
-- __extension__ \
-- ({ \
-- uint16x8_t b_ = (b); \
-- uint8x8_t a_ = (a); \
-- uint8x16_t result = vcombine_u8 \
-- (a_, vcreate_u8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezd_s64 (int64_t __a)
-+{
-+ return __a >= 0 ? -1ll : 0ll;
-+}
-
--#define vrshrn_high_n_u32(a, b, c) \
-- __extension__ \
-- ({ \
-- uint32x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint16x8_t result = vcombine_u16 \
-- (a_, vcreate_u16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezd_f64 (float64_t __a)
-+{
-+ return __a >= 0.0 ? -1ll : 0ll;
-+}
-
--#define vrshrn_high_n_u64(a, b, c) \
-- __extension__ \
-- ({ \
-- uint64x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint32x4_t result = vcombine_u32 \
-- (a_, vcreate_u32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+/* vcgt - vector. */
-
--#define vrshrn_n_s16(a, b) \
-- __extension__ \
-- ({ \
-- int16x8_t a_ = (a); \
-- int8x8_t result; \
-- __asm__ ("rshrn %0.8b,%1.8h,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return (uint32x2_t) (__a > __b);
-+}
-
--#define vrshrn_n_s32(a, b) \
-- __extension__ \
-- ({ \
-- int32x4_t a_ = (a); \
-- int16x4_t result; \
-- __asm__ ("rshrn %0.4h,%1.4s,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return (uint64x1_t) (__a > __b);
-+}
-
--#define vrshrn_n_s64(a, b) \
-- __extension__ \
-- ({ \
-- int64x2_t a_ = (a); \
-- int32x2_t result; \
-- __asm__ ("rshrn %0.2s,%1.2d,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s8 (int8x8_t __a, int8x8_t __b)
-+{
-+ return (uint8x8_t) (__a > __b);
-+}
-
--#define vrshrn_n_u16(a, b) \
-- __extension__ \
-- ({ \
-- uint16x8_t a_ = (a); \
-- uint8x8_t result; \
-- __asm__ ("rshrn %0.8b,%1.8h,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s16 (int16x4_t __a, int16x4_t __b)
-+{
-+ return (uint16x4_t) (__a > __b);
-+}
-
--#define vrshrn_n_u32(a, b) \
-- __extension__ \
-- ({ \
-- uint32x4_t a_ = (a); \
-- uint16x4_t result; \
-- __asm__ ("rshrn %0.4h,%1.4s,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s32 (int32x2_t __a, int32x2_t __b)
-+{
-+ return (uint32x2_t) (__a > __b);
-+}
-
--#define vrshrn_n_u64(a, b) \
-- __extension__ \
-- ({ \
-- uint64x2_t a_ = (a); \
-- uint32x2_t result; \
-- __asm__ ("rshrn %0.2s,%1.2d,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_s64 (int64x1_t __a, int64x1_t __b)
-+{
-+ return (uint64x1_t) (__a > __b);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrsqrte_f32 (float32x2_t a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- float32x2_t result;
-- __asm__ ("frsqrte %0.2s,%1.2s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrsqrte_f64 (float64x1_t a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- float64x1_t result;
-- __asm__ ("frsqrte %d0,%d1"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vrsqrte_u32 (uint32x2_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- uint32x2_t result;
-- __asm__ ("ursqrte %0.2s,%1.2s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vrsqrted_f64 (float64_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
- {
-- float64_t result;
-- __asm__ ("frsqrte %d0,%d1"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrsqrteq_f32 (float32x4_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- float32x4_t result;
-- __asm__ ("frsqrte %0.4s,%1.4s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a > __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrsqrteq_f64 (float64x2_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- float64x2_t result;
-- __asm__ ("frsqrte %0.2d,%1.2d"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a > __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vrsqrteq_u32 (uint32x4_t a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- uint32x4_t result;
-- __asm__ ("ursqrte %0.4s,%1.4s"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x16_t) (__a > __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vrsqrtes_f32 (float32_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- float32_t result;
-- __asm__ ("frsqrte %s0,%s1"
-- : "=w"(result)
-- : "w"(a)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x8_t) (__a > __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrsqrts_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- float32x2_t result;
-- __asm__ ("frsqrts %0.2s,%1.2s,%2.2s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a > __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vrsqrtsd_f64 (float64_t a, float64_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- float64_t result;
-- __asm__ ("frsqrts %d0,%d1,%d2"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a > __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- float32x4_t result;
-- __asm__ ("frsqrts %0.4s,%1.4s,%2.4s"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- float64x2_t result;
-- __asm__ ("frsqrts %0.2d,%1.2d,%2.2d"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vrsqrtss_f32 (float32_t a, float32_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- float32_t result;
-- __asm__ ("frsqrts %s0,%s1,%s2"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (__a > __b);
- }
-
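Aside: the frsqrte/frsqrts pairs removed in this hunk exist to support Newton-Raphson refinement; vrsqrts_f32 (a, b) computes (3 - a*b)/2. A minimal sketch of one refinement step, assuming an AArch64 target (illustrative, not part of the patch):

#include <arm_neon.h>

/* One Newton-Raphson step for 1/sqrt(x): e' = e * (3 - x*e*e) / 2,
   expressed with the estimate/step intrinsics.  */
static inline float32x2_t
rsqrt_refine (float32x2_t x, float32x2_t e)
{
  return vmul_f32 (e, vrsqrts_f32 (vmul_f32 (x, e), e));
}

Starting from e = vrsqrte_f32 (x), each call roughly doubles the number of correct bits.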
--#define vshrn_high_n_s16(a, b, c) \
-- __extension__ \
-- ({ \
-- int16x8_t b_ = (b); \
-- int8x8_t a_ = (a); \
-- int8x16_t result = vcombine_s8 \
-- (a_, vcreate_s8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("shrn2 %0.16b,%1.8h,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vshrn_high_n_s32(a, b, c) \
-- __extension__ \
-- ({ \
-- int32x4_t b_ = (b); \
-- int16x4_t a_ = (a); \
-- int16x8_t result = vcombine_s16 \
-- (a_, vcreate_s16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("shrn2 %0.8h,%1.4s,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vshrn_high_n_s64(a, b, c) \
-- __extension__ \
-- ({ \
-- int64x2_t b_ = (b); \
-- int32x2_t a_ = (a); \
-- int32x4_t result = vcombine_s32 \
-- (a_, vcreate_s32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("shrn2 %0.4s,%1.2d,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vshrn_high_n_u16(a, b, c) \
-- __extension__ \
-- ({ \
-- uint16x8_t b_ = (b); \
-- uint8x8_t a_ = (a); \
-- uint8x16_t result = vcombine_u8 \
-- (a_, vcreate_u8 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("shrn2 %0.16b,%1.8h,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vshrn_high_n_u32(a, b, c) \
-- __extension__ \
-- ({ \
-- uint32x4_t b_ = (b); \
-- uint16x4_t a_ = (a); \
-- uint16x8_t result = vcombine_u16 \
-- (a_, vcreate_u16 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("shrn2 %0.8h,%1.4s,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
--
--#define vshrn_high_n_u64(a, b, c) \
-- __extension__ \
-- ({ \
-- uint64x2_t b_ = (b); \
-- uint32x2_t a_ = (a); \
-- uint32x4_t result = vcombine_u32 \
-- (a_, vcreate_u32 \
-- (__AARCH64_UINT64_C (0x0))); \
-- __asm__ ("shrn2 %0.4s,%1.2d,#%2" \
-- : "+w"(result) \
-- : "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
-+{
-+ return (__a > __b);
-+}
-
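The rewritten vector comparisons return an element-wise mask: all-ones where the predicate holds, all-zeros otherwise. A minimal usage sketch (assumes an AArch64 target; not part of the patch):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  int32x4_t a = vdupq_n_s32 (5);
  int32x4_t b = vdupq_n_s32 (3);
  uint32x4_t m = vcgtq_s32 (a, b);           /* 5 > 3 in every lane */
  assert (vgetq_lane_u32 (m, 0) == 0xffffffffu);
  return 0;
}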
--#define vshrn_n_s16(a, b) \
-- __extension__ \
-- ({ \
-- int16x8_t a_ = (a); \
-- int8x8_t result; \
-- __asm__ ("shrn %0.8b,%1.8h,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+/* vcgt - scalar. */
-
--#define vshrn_n_s32(a, b) \
-- __extension__ \
-- ({ \
-- int32x4_t a_ = (a); \
-- int16x4_t result; \
-- __asm__ ("shrn %0.4h,%1.4s,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgts_f32 (float32_t __a, float32_t __b)
-+{
-+ return __a > __b ? -1 : 0;
-+}
-
--#define vshrn_n_s64(a, b) \
-- __extension__ \
-- ({ \
-- int64x2_t a_ = (a); \
-- int32x2_t result; \
-- __asm__ ("shrn %0.2s,%1.2d,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtd_s64 (int64_t __a, int64_t __b)
-+{
-+ return __a > __b ? -1ll : 0ll;
-+}
-
--#define vshrn_n_u16(a, b) \
-- __extension__ \
-- ({ \
-- uint16x8_t a_ = (a); \
-- uint8x8_t result; \
-- __asm__ ("shrn %0.8b,%1.8h,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtd_u64 (uint64_t __a, uint64_t __b)
-+{
-+ return __a > __b ? -1ll : 0ll;
-+}
-
--#define vshrn_n_u32(a, b) \
-- __extension__ \
-- ({ \
-- uint32x4_t a_ = (a); \
-- uint16x4_t result; \
-- __asm__ ("shrn %0.4h,%1.4s,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtd_f64 (float64_t __a, float64_t __b)
-+{
-+ return __a > __b ? -1ll : 0ll;
-+}
-
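The scalar forms rely on the `? -1 : 0` idiom: converting -1 to the unsigned return type yields the all-ones value, matching the vector mask convention. A quick check (a sketch, same target assumptions as above):

#include <arm_neon.h>
#include <stdint.h>
#include <assert.h>

int main (void)
{
  assert (vcgtd_s64 (2, 1) == UINT64_MAX);   /* -1ll converts to all-ones */
  assert (vcgts_f32 (1.0f, 2.0f) == 0);
  return 0;
}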
--#define vshrn_n_u64(a, b) \
-- __extension__ \
-- ({ \
-- uint64x2_t a_ = (a); \
-- uint32x2_t result; \
-- __asm__ ("shrn %0.2s,%1.2d,%2" \
-- : "=w"(result) \
-- : "w"(a_), "i"(b) \
-- : /* No clobbers */); \
-- result; \
-- })
-+/* vcgtz - vector. */
-
--#define vsli_n_p8(a, b, c) \
-- __extension__ \
-- ({ \
-- poly8x8_t b_ = (b); \
-- poly8x8_t a_ = (a); \
-- poly8x8_t result; \
-- __asm__ ("sli %0.8b,%2.8b,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_f32 (float32x2_t __a)
-+{
-+ return (uint32x2_t) (__a > 0.0f);
-+}
-
--#define vsli_n_p16(a, b, c) \
-- __extension__ \
-- ({ \
-- poly16x4_t b_ = (b); \
-- poly16x4_t a_ = (a); \
-- poly16x4_t result; \
-- __asm__ ("sli %0.4h,%2.4h,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_f64 (float64x1_t __a)
-+{
-+ return (uint64x1_t) (__a > (float64x1_t) {0.0});
-+}
-
--#define vsliq_n_p8(a, b, c) \
-- __extension__ \
-- ({ \
-- poly8x16_t b_ = (b); \
-- poly8x16_t a_ = (a); \
-- poly8x16_t result; \
-- __asm__ ("sli %0.16b,%2.16b,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s8 (int8x8_t __a)
-+{
-+ return (uint8x8_t) (__a > 0);
-+}
-
--#define vsliq_n_p16(a, b, c) \
-- __extension__ \
-- ({ \
-- poly16x8_t b_ = (b); \
-- poly16x8_t a_ = (a); \
-- poly16x8_t result; \
-- __asm__ ("sli %0.8h,%2.8h,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s16 (int16x4_t __a)
-+{
-+ return (uint16x4_t) (__a > 0);
-+}
-
--#define vsri_n_p8(a, b, c) \
-- __extension__ \
-- ({ \
-- poly8x8_t b_ = (b); \
-- poly8x8_t a_ = (a); \
-- poly8x8_t result; \
-- __asm__ ("sri %0.8b,%2.8b,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s32 (int32x2_t __a)
-+{
-+ return (uint32x2_t) (__a > 0);
-+}
-
--#define vsri_n_p16(a, b, c) \
-- __extension__ \
-- ({ \
-- poly16x4_t b_ = (b); \
-- poly16x4_t a_ = (a); \
-- poly16x4_t result; \
-- __asm__ ("sri %0.4h,%2.4h,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_s64 (int64x1_t __a)
-+{
-+ return (uint64x1_t) (__a > __AARCH64_INT64_C (0));
-+}
-
--#define vsriq_n_p8(a, b, c) \
-- __extension__ \
-- ({ \
-- poly8x16_t b_ = (b); \
-- poly8x16_t a_ = (a); \
-- poly8x16_t result; \
-- __asm__ ("sri %0.16b,%2.16b,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_f32 (float32x4_t __a)
-+{
-+ return (uint32x4_t) (__a > 0.0f);
-+}
-
--#define vsriq_n_p16(a, b, c) \
-- __extension__ \
-- ({ \
-- poly16x8_t b_ = (b); \
-- poly16x8_t a_ = (a); \
-- poly16x8_t result; \
-- __asm__ ("sri %0.8h,%2.8h,%3" \
-- : "=w"(result) \
-- : "0"(a_), "w"(b_), "i"(c) \
-- : /* No clobbers */); \
-- result; \
-- })
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_f64 (float64x2_t __a)
-+{
-+ return (uint64x2_t) (__a > 0.0);
-+}
-
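The vshrn_n_* macros removed above are shift-right-and-narrow operations: each lane is shifted right by an immediate and truncated to half width. A usage sketch (not part of the patch):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  int16x8_t v = vdupq_n_s16 (0x0100);
  int8x8_t r = vshrn_n_s16 (v, 8);           /* (0x0100 >> 8) = 1, narrowed */
  assert (vget_lane_s8 (r, 0) == 1);
  return 0;
}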
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtst_p8 (poly8x8_t a, poly8x8_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s8 (int8x16_t __a)
- {
-- uint8x8_t result;
-- __asm__ ("cmtst %0.8b, %1.8b, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x16_t) (__a > 0);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vtst_p16 (poly16x4_t a, poly16x4_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s16 (int16x8_t __a)
- {
-- uint16x4_t result;
-- __asm__ ("cmtst %0.4h, %1.4h, %2.4h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x8_t) (__a > 0);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vtstq_p8 (poly8x16_t a, poly8x16_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s32 (int32x4_t __a)
- {
-- uint8x16_t result;
-- __asm__ ("cmtst %0.16b, %1.16b, %2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a > 0);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vtstq_p16 (poly16x8_t a, poly16x8_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_s64 (int64x2_t __a)
- {
-- uint16x8_t result;
-- __asm__ ("cmtst %0.8h, %1.8h, %2.8h"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a > __AARCH64_INT64_C (0));
- }
-
--/* End of temporary inline asm implementations. */
-+/* vcgtz - scalar. */
-
--/* Start of temporary inline asm for vldn, vstn and friends. */
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzs_f32 (float32_t __a)
-+{
-+ return __a > 0.0f ? -1 : 0;
-+}
-
--/* Create struct element types for duplicating loads.
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzd_s64 (int64_t __a)
-+{
-+ return __a > 0 ? -1ll : 0ll;
-+}
-
-- Create 2 element structures of:
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzd_f64 (float64_t __a)
-+{
-+ return __a > 0.0 ? -1ll : 0ll;
-+}
-
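Note that the one-lane f64 variants compare against the compound literal (float64x1_t) {0.0} because float64x1_t is itself a vector type. A sketch of the zero-test family in use (illustrative):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  float64x1_t x = vdup_n_f64 (1.5);
  uint64x1_t m = vcgtz_f64 (x);              /* 1.5 > 0.0: all-ones lane */
  assert (vget_lane_u64 (m, 0) == ~0ull);
  assert (vcgtzd_f64 (-2.0) == 0);           /* scalar form */
  return 0;
}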
-- +------+----+----+----+----+
-- | | 8 | 16 | 32 | 64 |
-- +------+----+----+----+----+
-- |int | Y | Y | N | N |
-- +------+----+----+----+----+
-- |uint | Y | Y | N | N |
-- +------+----+----+----+----+
-- |float | - | Y | N | N |
-- +------+----+----+----+----+
-- |poly | Y | Y | - | - |
-- +------+----+----+----+----+
-+/* vcle - vector. */
-
-- Create 3 element structures of:
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_f32 (float32x2_t __a, float32x2_t __b)
-+{
-+ return (uint32x2_t) (__a <= __b);
-+}
-
-- +------+----+----+----+----+
-- | | 8 | 16 | 32 | 64 |
-- +------+----+----+----+----+
-- |int | Y | Y | Y | Y |
-- +------+----+----+----+----+
-- |uint | Y | Y | Y | Y |
-- +------+----+----+----+----+
-- |float | - | Y | Y | Y |
-- +------+----+----+----+----+
-- |poly | Y | Y | - | - |
-- +------+----+----+----+----+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_f64 (float64x1_t __a, float64x1_t __b)
-+{
-+ return (uint64x1_t) (__a <= __b);
-+}
-
-- Create 4 element structures of:
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s8 (int8x8_t __a, int8x8_t __b)
-+{
-+ return (uint8x8_t) (__a <= __b);
-+}
-
-- +------+----+----+----+----+
-- | | 8 | 16 | 32 | 64 |
-- +------+----+----+----+----+
-- |int | Y | N | N | Y |
-- +------+----+----+----+----+
-- |uint | Y | N | N | Y |
-- +------+----+----+----+----+
-- |float | - | N | N | Y |
-- +------+----+----+----+----+
-- |poly | Y | N | - | - |
-- +------+----+----+----+----+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s16 (int16x4_t __a, int16x4_t __b)
-+{
-+ return (uint16x4_t) (__a <= __b);
-+}
-
-- This is required for casting memory reference. */
--#define __STRUCTN(t, sz, nelem) \
-- typedef struct t ## sz ## x ## nelem ## _t { \
-- t ## sz ## _t val[nelem]; \
-- } t ## sz ## x ## nelem ## _t;
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s32 (int32x2_t __a, int32x2_t __b)
-+{
-+ return (uint32x2_t) (__a <= __b);
-+}
-
--/* 2-element structs. */
--__STRUCTN (int, 8, 2)
--__STRUCTN (int, 16, 2)
--__STRUCTN (uint, 8, 2)
--__STRUCTN (uint, 16, 2)
--__STRUCTN (float, 16, 2)
--__STRUCTN (poly, 8, 2)
--__STRUCTN (poly, 16, 2)
--/* 3-element structs. */
--__STRUCTN (int, 8, 3)
--__STRUCTN (int, 16, 3)
--__STRUCTN (int, 32, 3)
--__STRUCTN (int, 64, 3)
--__STRUCTN (uint, 8, 3)
--__STRUCTN (uint, 16, 3)
--__STRUCTN (uint, 32, 3)
--__STRUCTN (uint, 64, 3)
--__STRUCTN (float, 16, 3)
--__STRUCTN (float, 32, 3)
--__STRUCTN (float, 64, 3)
--__STRUCTN (poly, 8, 3)
--__STRUCTN (poly, 16, 3)
--/* 4-element structs. */
--__STRUCTN (int, 8, 4)
--__STRUCTN (int, 64, 4)
--__STRUCTN (uint, 8, 4)
--__STRUCTN (uint, 64, 4)
--__STRUCTN (poly, 8, 4)
--__STRUCTN (float, 64, 4)
--#undef __STRUCTN
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_s64 (int64x1_t __a, int64x1_t __b)
-+{
-+ return (uint64x1_t) (__a <= __b);
-+}
-
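For reference, the __STRUCTN helper deleted above is plain token pasting; __STRUCTN (int, 8, 2), for instance, expanded to:

typedef struct int8x2_t {
  int8_t val[2];
} int8x2_t;

giving the aggregate layouts needed for casting memory references in the structure loads and stores.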
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
-+{
-+ return (__a <= __b);
-+}
-
--#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \
-- qmode, ptr_mode, funcsuffix, signedtype) \
--__extension__ static __inline void \
--__attribute__ ((__always_inline__)) \
--vst2_lane_ ## funcsuffix (ptrtype *__ptr, \
-- intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_oi __o; \
-- largetype __temp; \
-- __temp.val[0] \
-- = vcombine_##funcsuffix (__b.val[0], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __temp.val[1] \
-- = vcombine_##funcsuffix (__b.val[1], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-- (signedtype) __temp.val[0], 0); \
-- __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-- (signedtype) __temp.val[1], 1); \
-- __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-- __ptr, __o, __c); \
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
-+{
-+ return (__a <= __b);
- }
-
--__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
-- float16x8_t)
--__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
-- float32x4_t)
--__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
-- float64x2_t)
--__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-- int8x16_t)
--__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
-- int16x8_t)
--__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-- int8x16_t)
--__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-- int16x8_t)
--__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-- int32x4_t)
--__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-- int64x2_t)
--__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-- int8x16_t)
--__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
-- int16x8_t)
--__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
-- int32x4_t)
--__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
-- int64x2_t)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
-+{
-+ return (__a <= __b);
-+}
-
--#undef __ST2_LANE_FUNC
--#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
--__extension__ static __inline void \
--__attribute__ ((__always_inline__)) \
--vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
-- intype __b, const int __c) \
--{ \
-- union { intype __i; \
-- __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
-- __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-- __ptr, __temp.__o, __c); \
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_u64 (uint64x1_t __a, uint64x1_t __b)
-+{
-+ return (__a <= __b);
- }
-
--__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
--__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
--__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
--__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
--__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
--__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
--__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
--__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
--__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
--__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
--__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
--__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
--__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_f32 (float32x4_t __a, float32x4_t __b)
-+{
-+ return (uint32x4_t) (__a <= __b);
-+}
-
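The q-form store macros deleted above reinterpret the NEON structure type as the builtin register-tuple type through a union rather than per-member casts. The same pattern in miniature (hypothetical types, for illustration only):

#include <stdint.h>

/* Initialise the first union member, read the aliased view back --
   the trick __ST2_LANE_FUNC applies to __builtin_aarch64_simd_oi.  */
static inline uint32_t
bits_of (float f)
{
  union { float i; uint32_t o; } tmp = { f };
  return tmp.o;
}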
--#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \
-- qmode, ptr_mode, funcsuffix, signedtype) \
--__extension__ static __inline void \
--__attribute__ ((__always_inline__)) \
--vst3_lane_ ## funcsuffix (ptrtype *__ptr, \
-- intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_ci __o; \
-- largetype __temp; \
-- __temp.val[0] \
-- = vcombine_##funcsuffix (__b.val[0], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __temp.val[1] \
-- = vcombine_##funcsuffix (__b.val[1], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __temp.val[2] \
-- = vcombine_##funcsuffix (__b.val[2], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __o = __builtin_aarch64_set_qregci##qmode (__o, \
-- (signedtype) __temp.val[0], 0); \
-- __o = __builtin_aarch64_set_qregci##qmode (__o, \
-- (signedtype) __temp.val[1], 1); \
-- __o = __builtin_aarch64_set_qregci##qmode (__o, \
-- (signedtype) __temp.val[2], 2); \
-- __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-- __ptr, __o, __c); \
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_f64 (float64x2_t __a, float64x2_t __b)
-+{
-+ return (uint64x2_t) (__a <= __b);
- }
-
--__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
-- float16x8_t)
--__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32,
-- float32x4_t)
--__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64,
-- float64x2_t)
--__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
-- int8x16_t)
--__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16,
-- int16x8_t)
--__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
-- int8x16_t)
--__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
-- int16x8_t)
--__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
-- int32x4_t)
--__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64,
-- int64x2_t)
--__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
-- int8x16_t)
--__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16,
-- int16x8_t)
--__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32,
-- int32x4_t)
--__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
-- int64x2_t)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s8 (int8x16_t __a, int8x16_t __b)
-+{
-+ return (uint8x16_t) (__a <= __b);
-+}
-
--#undef __ST3_LANE_FUNC
--#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
--__extension__ static __inline void \
--__attribute__ ((__always_inline__)) \
--vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
-- intype __b, const int __c) \
--{ \
-- union { intype __i; \
-- __builtin_aarch64_simd_ci __o; } __temp = { __b }; \
-- __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-- __ptr, __temp.__o, __c); \
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s16 (int16x8_t __a, int16x8_t __b)
-+{
-+ return (uint16x8_t) (__a <= __b);
- }
-
--__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
--__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
--__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
--__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
--__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
--__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
--__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
--__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
--__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
--__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
--__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
--__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
--__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s32 (int32x4_t __a, int32x4_t __b)
-+{
-+ return (uint32x4_t) (__a <= __b);
-+}
-
--#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \
-- qmode, ptr_mode, funcsuffix, signedtype) \
--__extension__ static __inline void \
--__attribute__ ((__always_inline__)) \
--vst4_lane_ ## funcsuffix (ptrtype *__ptr, \
-- intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_xi __o; \
-- largetype __temp; \
-- __temp.val[0] \
-- = vcombine_##funcsuffix (__b.val[0], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __temp.val[1] \
-- = vcombine_##funcsuffix (__b.val[1], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __temp.val[2] \
-- = vcombine_##funcsuffix (__b.val[2], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __temp.val[3] \
-- = vcombine_##funcsuffix (__b.val[3], \
-- vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[0], 0); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[1], 1); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[2], 2); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[3], 3); \
-- __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-- __ptr, __o, __c); \
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_s64 (int64x2_t __a, int64x2_t __b)
-+{
-+ return (uint64x2_t) (__a <= __b);
- }
-
--__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
-- float16x8_t)
--__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32,
-- float32x4_t)
--__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64,
-- float64x2_t)
--__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
-- int8x16_t)
--__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16,
-- int16x8_t)
--__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
-- int8x16_t)
--__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
-- int16x8_t)
--__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
-- int32x4_t)
--__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64,
-- int64x2_t)
--__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
-- int8x16_t)
--__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16,
-- int16x8_t)
--__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32,
-- int32x4_t)
--__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
-- int64x2_t)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
-+{
-+ return (__a <= __b);
-+}
-
--#undef __ST4_LANE_FUNC
--#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
--__extension__ static __inline void \
--__attribute__ ((__always_inline__)) \
--vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
-- intype __b, const int __c) \
--{ \
-- union { intype __i; \
-- __builtin_aarch64_simd_xi __o; } __temp = { __b }; \
-- __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-- __ptr, __temp.__o, __c); \
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
-+{
-+ return (__a <= __b);
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
-+{
-+ return (__a <= __b);
-+}
-+
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
-+{
-+ return (__a <= __b);
- }
-
--__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
--__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
--__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
--__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
--__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
--__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
--__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
--__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
--__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
--__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
--__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
--__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
--__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
-+/* vcle - scalar. */
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vaddlv_s32 (int32x2_t a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcles_f32 (float32_t __a, float32_t __b)
- {
-- int64_t result;
-- __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
-- return result;
-+ return __a <= __b ? -1 : 0;
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vaddlv_u32 (uint32x2_t a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcled_s64 (int64_t __a, int64_t __b)
- {
-- uint64_t result;
-- __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
-- return result;
-+ return __a <= __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcled_u64 (uint64_t __a, uint64_t __b)
- {
-- return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c);
-+ return __a <= __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcled_f64 (float64_t __a, float64_t __b)
- {
-- return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c);
-+ return __a <= __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+/* vclez - vector. */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c);
-+ return (uint32x2_t) (__a <= 0.0f);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c);
-+ return (uint64x1_t) (__a <= (float64x1_t) {0.0});
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s8 (int8x8_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c);
-+ return (uint8x8_t) (__a <= 0);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s16 (int16x4_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c);
-+ return (uint16x4_t) (__a <= 0);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s32 (int32x2_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c);
-+ return (uint32x2_t) (__a <= 0);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_s64 (int64x1_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c);
-+ return (uint64x1_t) (__a <= __AARCH64_INT64_C (0));
- }
-
--/* Table intrinsics. */
--
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbl1_p8 (poly8x16_t a, uint8x8_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_f32 (float32x4_t __a)
- {
-- poly8x8_t result;
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a <= 0.0f);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbl1_s8 (int8x16_t a, uint8x8_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_f64 (float64x2_t __a)
- {
-- int8x8_t result;
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a <= 0.0);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s8 (int8x16_t __a)
- {
-- uint8x8_t result;
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x16_t) (__a <= 0);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbl1q_p8 (poly8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s16 (int16x8_t __a)
- {
-- poly8x16_t result;
-- __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint16x8_t) (__a <= 0);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbl1q_s8 (int8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s32 (int32x4_t __a)
- {
-- int8x16_t result;
-- __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x4_t) (__a <= 0);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbl1q_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_s64 (int64x2_t __a)
- {
-- uint8x16_t result;
-- __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-- : "=w"(result)
-- : "w"(a), "w"(b)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x2_t) (__a <= __AARCH64_INT64_C (0));
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx)
-+/* vclez - scalar. */
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezs_f32 (float32_t __a)
- {
-- int8x8_t result = r;
-- __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
-- : "+w"(result)
-- : "w"(tab), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return __a <= 0.0f ? -1 : 0;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezd_s64 (int64_t __a)
- {
-- uint8x8_t result = r;
-- __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
-- : "+w"(result)
-- : "w"(tab), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return __a <= 0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezd_f64 (float64_t __a)
- {
-- poly8x8_t result = r;
-- __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
-- : "+w"(result)
-- : "w"(tab), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return __a <= 0.0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx)
-+/* vclt - vector. */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_f32 (float32x2_t __a, float32x2_t __b)
- {
-- int8x16_t result = r;
-- __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-- : "+w"(result)
-- : "w"(tab), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x2_t) (__a < __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_f64 (float64x1_t __a, float64x1_t __b)
- {
-- uint8x16_t result = r;
-- __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-- : "+w"(result)
-- : "w"(tab), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x1_t) (__a < __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s8 (int8x8_t __a, int8x8_t __b)
- {
-- poly8x16_t result = r;
-- __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-- : "+w"(result)
-- : "w"(tab), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (uint8x8_t) (__a < __b);
- }
-
--/* V7 legacy table intrinsics. */
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s16 (int16x4_t __a, int16x4_t __b)
-+{
-+ return (uint16x4_t) (__a < __b);
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbl1_s8 (int8x8_t tab, int8x8_t idx)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s32 (int32x2_t __a, int32x2_t __b)
- {
-- int8x8_t result;
-- int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (uint32x2_t) (__a < __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_s64 (int64x1_t __a, int64x1_t __b)
- {
-- uint8x8_t result;
-- uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (uint64x1_t) (__a < __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbl1_p8 (poly8x8_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- poly8x8_t result;
-- poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbl2_s8 (int8x8x2_t tab, int8x8_t idx)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- int8x8_t result;
-- int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- uint8x8_t result;
-- uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_u64 (uint64x1_t __a, uint64x1_t __b)
- {
-- poly8x8_t result;
-- poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-- __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-- : "=w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- int8x8_t result;
-- int8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-- temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = __builtin_aarch64_tbl3v8qi (__o, idx);
-- return result;
-+ return (uint32x4_t) (__a < __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- uint8x8_t result;
-- uint8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-- temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-- return result;
-+ return (uint64x2_t) (__a < __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- poly8x8_t result;
-- poly8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-- temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-- return result;
-+ return (uint8x16_t) (__a < __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- int8x8_t result;
-- int8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-- temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = __builtin_aarch64_tbl3v8qi (__o, idx);
-- return result;
-+ return (uint16x8_t) (__a < __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- uint8x8_t result;
-- uint8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-- temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-- return result;
-+ return (uint32x4_t) (__a < __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- poly8x8_t result;
-- poly8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-- temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-- return result;
-+ return (uint64x2_t) (__a < __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- int8x8_t result = r;
-- int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-- __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-- : "+w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- uint8x8_t result = r;
-- uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-- __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-- : "+w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- poly8x8_t result = r;
-- poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-- __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-- : "+w"(result)
-- : "w"(temp), "w"(idx)
-- : /* No clobbers */);
-- return result;
-+ return (__a < __b);
- }
-
--/* End of temporary inline asm. */
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
-+{
-+ return (__a < __b);
-+}
-
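The table intrinsics removed in this region map to TBL/TBX: a byte-indexed lookup where out-of-range indices yield zero (TBL) or leave the destination byte unchanged (TBX). Usage sketch (not part of the patch):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  uint8_t bytes[16];
  for (int i = 0; i < 16; i++)
    bytes[i] = 0x10 + i;
  uint8x16_t tab = vld1q_u8 (bytes);
  uint8x16_t idx = vdupq_n_u8 (3);
  uint8x16_t r = vqtbl1q_u8 (tab, idx);      /* every lane reads tab[3] */
  assert (vgetq_lane_u8 (r, 0) == 0x13);
  return 0;
}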
--/* Start of optimal implementations in approved order. */
-+/* vclt - scalar. */
-
--/* vabs */
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclts_f32 (float32_t __a, float32_t __b)
-+{
-+ return __a < __b ? -1 : 0;
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vabs_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltd_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_absv2sf (__a);
-+ return __a < __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vabs_f64 (float64x1_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltd_u64 (uint64_t __a, uint64_t __b)
- {
-- return (float64x1_t) {__builtin_fabs (__a[0])};
-+ return __a < __b ? -1ll : 0ll;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vabs_s8 (int8x8_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltd_f64 (float64_t __a, float64_t __b)
- {
-- return __builtin_aarch64_absv8qi (__a);
-+ return __a < __b ? -1ll : 0ll;
- }
-
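Throughout these hunks the declaration idiom changes from `static __inline` to `extern __inline` with `__gnu_inline__`, which (roughly) means no out-of-line definition is emitted regardless of the -std level, while `__artificial__` aids debugging. The intrinsics themselves behave as before, e.g. for the vabs_* family being touched here (illustrative):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  int32x2_t v = {-3, 4};
  int32x2_t a = vabs_s32 (v);
  assert (vget_lane_s32 (a, 0) == 3 && vget_lane_s32 (a, 1) == 4);
  return 0;
}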
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vabs_s16 (int16x4_t __a)
-+/* vcltz - vector. */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_absv4hi (__a);
-+ return (uint32x2_t) (__a < 0.0f);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vabs_s32 (int32x2_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_absv2si (__a);
-+ return (uint64x1_t) (__a < (float64x1_t) {0.0});
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vabs_s64 (int64x1_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s8 (int8x8_t __a)
- {
-- return (int64x1_t) {__builtin_aarch64_absdi (__a[0])};
-+ return (uint8x8_t) (__a < 0);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vabsq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s16 (int16x4_t __a)
- {
-- return __builtin_aarch64_absv4sf (__a);
-+ return (uint16x4_t) (__a < 0);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vabsq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s32 (int32x2_t __a)
- {
-- return __builtin_aarch64_absv2df (__a);
-+ return (uint32x2_t) (__a < 0);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vabsq_s8 (int8x16_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_s64 (int64x1_t __a)
- {
-- return __builtin_aarch64_absv16qi (__a);
-+ return (uint64x1_t) (__a < __AARCH64_INT64_C (0));
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vabsq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_absv8hi (__a);
-+ return (uint32x4_t) (__a < 0.0f);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vabsq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_absv4si (__a);
-+ return (uint64x2_t) (__a < 0.0);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vabsq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s8 (int8x16_t __a)
- {
-- return __builtin_aarch64_absv2di (__a);
-+ return (uint8x16_t) (__a < 0);
- }
-
--/* vadd */
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s16 (int16x8_t __a)
-+{
-+ return (uint16x8_t) (__a < 0);
-+}
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vaddd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s32 (int32x4_t __a)
- {
-- return __a + __b;
-+ return (uint32x4_t) (__a < 0);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vaddd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_s64 (int64x2_t __a)
- {
-- return __a + __b;
-+ return (uint64x2_t) (__a < __AARCH64_INT64_C (0));
- }
-
--/* vaddv */
-+/* vcltz - scalar. */
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vaddv_s8 (int8x8_t __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzs_f32 (float32_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v8qi (__a);
-+ return __a < 0.0f ? -1 : 0;
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vaddv_s16 (int16x4_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzd_s64 (int64_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v4hi (__a);
-+ return __a < 0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vaddv_s32 (int32x2_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzd_f64 (float64_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v2si (__a);
-+ return __a < 0.0 ? -1ll : 0ll;
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vaddv_u8 (uint8x8_t __a)
-+/* vcls. */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcls_s8 (int8x8_t __a)
- {
-- return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a);
-+ return __builtin_aarch64_clrsbv8qi (__a);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vaddv_u16 (uint16x4_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcls_s16 (int16x4_t __a)
- {
-- return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a);
-+ return __builtin_aarch64_clrsbv4hi (__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vaddv_u32 (uint32x2_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcls_s32 (int32x2_t __a)
- {
-- return (int32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a);
-+ return __builtin_aarch64_clrsbv2si (__a);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vaddvq_s8 (int8x16_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclsq_s8 (int8x16_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v16qi (__a);
-+ return __builtin_aarch64_clrsbv16qi (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vaddvq_s16 (int16x8_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclsq_s16 (int16x8_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v8hi (__a);
-+ return __builtin_aarch64_clrsbv8hi (__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vaddvq_s32 (int32x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclsq_s32 (int32x4_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v4si (__a);
-+ return __builtin_aarch64_clrsbv4si (__a);
- }
-
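vcls maps to __builtin_clrsb*: the count of leading redundant sign bits, i.e. how many bits below the sign bit equal it. For an 8-bit lane, both 0 and -1 give 7. Sketch (not part of the patch):

#include <arm_neon.h>
#include <assert.h>

int main (void)
{
  int8x8_t v = vdup_n_s8 (-1);
  int8x8_t n = vcls_s8 (v);                  /* 7 redundant sign bits */
  assert (vget_lane_s8 (n, 0) == 7);
  return 0;
}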
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vaddvq_s64 (int64x2_t __a)
-+/* vclz. */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_s8 (int8x8_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v2di (__a);
-+ return __builtin_aarch64_clzv8qi (__a);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vaddvq_u8 (uint8x16_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_s16 (int16x4_t __a)
- {
-- return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a);
-+ return __builtin_aarch64_clzv4hi (__a);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vaddvq_u16 (uint16x8_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_s32 (int32x2_t __a)
- {
-- return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a);
-+ return __builtin_aarch64_clzv2si (__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vaddvq_u32 (uint32x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_u8 (uint8x8_t __a)
- {
-- return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a);
-+ return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vaddvq_u64 (uint64x2_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_u16 (uint16x4_t __a)
- {
-- return (uint64_t) __builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a);
-+ return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vaddv_f32 (float32x2_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclz_u32 (uint32x2_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
-+ return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vaddvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_s8 (int8x16_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v4sf (__a);
-+ return __builtin_aarch64_clzv16qi (__a);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vaddvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_s16 (int16x8_t __a)
- {
-- return __builtin_aarch64_reduc_plus_scal_v2df (__a);
-+ return __builtin_aarch64_clzv8hi (__a);
- }
-
--/* vbsl */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_s32 (int32x4_t __a)
-+{
-+ return __builtin_aarch64_clzv4si (__a);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_u8 (uint8x16_t __a)
- {
-- return __builtin_aarch64_simd_bslv2sf_suss (__a, __b, __c);
-+ return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vbsl_f64 (uint64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_u16 (uint16x8_t __a)
- {
-- return (float64x1_t)
-- { __builtin_aarch64_simd_bsldf_suss (__a[0], __b[0], __c[0]) };
-+ return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclzq_u32 (uint32x4_t __a)
- {
-- return __builtin_aarch64_simd_bslv8qi_pupp (__a, __b, __c);
-+ return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
- }
-
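For reference (not part of the patch), the vclz family counts leading zero bits per lane:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  /* clz(1) == 31 for 32-bit lanes.  */
  uint32x2_t r = vclz_u32 (vdup_n_u32 (1));
  printf ("%u\n", vget_lane_u32 (r, 0));   /* prints 31 */
  return 0;
}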
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
-+/* vcnt. */
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcnt_p8 (poly8x8_t __a)
- {
-- return __builtin_aarch64_simd_bslv4hi_pupp (__a, __b, __c);
-+ return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcnt_s8 (int8x8_t __a)
- {
-- return __builtin_aarch64_simd_bslv8qi_suss (__a, __b, __c);
-+ return __builtin_aarch64_popcountv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcnt_u8 (uint8x8_t __a)
- {
-- return __builtin_aarch64_simd_bslv4hi_suss (__a, __b, __c);
-+ return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcntq_p8 (poly8x16_t __a)
- {
-- return __builtin_aarch64_simd_bslv2si_suss (__a, __b, __c);
-+ return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcntq_s8 (int8x16_t __a)
- {
-- return (int64x1_t)
-- {__builtin_aarch64_simd_bsldi_suss (__a[0], __b[0], __c[0])};
-+ return __builtin_aarch64_popcountv16qi (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcntq_u8 (uint8x16_t __a)
- {
-- return __builtin_aarch64_simd_bslv8qi_uuuu (__a, __b, __c);
-+ return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
- }
-
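Likewise for vcnt, a per-byte population count (not part of the patch):

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  /* 0xf0 has four bits set in each byte lane.  */
  uint8x8_t r = vcnt_u8 (vdup_n_u8 (0xf0));
  printf ("%u\n", (unsigned) vget_lane_u8 (r, 0));   /* prints 4 */
  return 0;
}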
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-+/* vcopy_lane. */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_f32 (float32x2_t __a, const int __lane1,
-+ float32x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv4hi_uuuu (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_f64 (float64x1_t __a, const int __lane1,
-+ float64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv2si_uuuu (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_p8 (poly8x8_t __a, const int __lane1,
-+ poly8x8_t __b, const int __lane2)
- {
-- return (uint64x1_t)
-- {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])};
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_p16 (poly16x4_t __a, const int __lane1,
-+ poly16x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv4sf_suss (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vbslq_f64 (uint64x2_t __a, float64x2_t __b, float64x2_t __c)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_p64 (poly64x1_t __a, const int __lane1,
-+ poly64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv2df_suss (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s8 (int8x8_t __a, const int __lane1,
-+ int8x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv16qi_pupp (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s16 (int16x4_t __a, const int __lane1,
-+ int16x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv8hi_pupp (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s32 (int32x2_t __a, const int __lane1,
-+ int32x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv16qi_suss (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_s64 (int64x1_t __a, const int __lane1,
-+ int64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv8hi_suss (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u8 (uint8x8_t __a, const int __lane1,
-+ uint8x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv4si_suss (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u16 (uint16x4_t __a, const int __lane1,
-+ uint16x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv2di_suss (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u32 (uint32x2_t __a, const int __lane1,
-+ uint32x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv16qi_uuuu (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_lane_u64 (uint64x1_t __a, const int __lane1,
-+ uint64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv8hi_uuuu (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
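A usage sketch for the vcopy_lane family above (not part of the patch): both operands are 64-bit vectors, and both lane indices must be integer constant expressions, as the __aarch64_vset_lane_any/__aarch64_vget_lane_any expansion requires.

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  int16x4_t a = {10, 11, 12, 13};
  int16x4_t b = {20, 21, 22, 23};
  /* Insert lane 3 of b into lane 1 of a: result {10, 23, 12, 13}.  */
  int16x4_t r = vcopy_lane_s16 (a, 1, b, 3);
  printf ("%d\n", vget_lane_s16 (r, 1));   /* prints 23 */
  return 0;
}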
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-+/* vcopy_laneq. */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_f32 (float32x2_t __a, const int __lane1,
-+ float32x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv4si_uuuu (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_f64 (float64x1_t __a, const int __lane1,
-+ float64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_simd_bslv2di_uuuu (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--/* ARMv8.1 intrinsics. */
--#pragma GCC push_options
--#pragma GCC target ("arch=armv8.1-a")
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_p8 (poly8x8_t __a, const int __lane1,
-+ poly8x16_t __b, const int __lane2)
-+{
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
-+}
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_p16 (poly16x4_t __a, const int __lane1,
-+ poly16x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_p64 (poly64x1_t __a, const int __lane1,
-+ poly64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlahv2si (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s8 (int8x8_t __a, const int __lane1,
-+ int8x16_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s16 (int16x4_t __a, const int __lane1,
-+ int16x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlahv4si (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s32 (int32x2_t __a, const int __lane1,
-+ int32x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlshv4hi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_s64 (int64x1_t __a, const int __lane1,
-+ int64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlshv2si (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u8 (uint8x8_t __a, const int __lane1,
-+ uint8x16_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u16 (uint16x4_t __a, const int __lane1,
-+ uint16x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlshv4si (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmlah_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u32 (uint32x2_t __a, const int __lane1,
-+ uint32x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_laneqv4hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmlah_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopy_laneq_u64 (uint64x1_t __a, const int __lane1,
-+ uint64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_laneqv2si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
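The vcopy_laneq forms differ only in reading from a 128-bit source, so the source lane index covers the wider range. A sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  uint8x8_t a = vdup_n_u8 (0);
  uint8x16_t b = vdupq_n_u8 (0xab);
  /* 64-bit destination, 128-bit source: lane index up to 15 here.  */
  uint8x8_t r = vcopy_laneq_u8 (a, 0, b, 15);
  printf ("%#x\n", (unsigned) vget_lane_u8 (r, 0));   /* prints 0xab */
  return 0;
}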
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmlahq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
-+/* vcopyq_lane. */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_f32 (float32x4_t __a, const int __lane1,
-+ float32x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_laneqv8hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmlahq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_f64 (float64x2_t __a, const int __lane1,
-+ float64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_laneqv4si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmlsh_laneq_s16 (int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_p8 (poly8x16_t __a, const int __lane1,
-+ poly8x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_laneqv4hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmlsh_laneq_s32 (int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_p16 (poly16x8_t __a, const int __lane1,
-+ poly16x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_laneqv2si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmlshq_laneq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_p64 (poly64x2_t __a, const int __lane1,
-+ poly64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_laneqv8hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmlshq_laneq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s8 (int8x16_t __a, const int __lane1,
-+ int8x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_laneqv4si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s16 (int16x8_t __a, const int __lane1,
-+ int16x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_lanev4hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s32 (int32x4_t __a, const int __lane1,
-+ int32x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_lanev2si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_s64 (int64x2_t __a, const int __lane1,
-+ int64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_lanev8hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u8 (uint8x16_t __a, const int __lane1,
-+ uint8x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_lanev4si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmlahh_s16 (int16_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u16 (uint16x8_t __a, const int __lane1,
-+ uint16x4_t __b, const int __lane2)
- {
-- return (int16_t) __builtin_aarch64_sqrdmlahhi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmlahh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u32 (uint32x4_t __a, const int __lane1,
-+ uint32x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_lanehi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmlahh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_lane_u64 (uint64x2_t __a, const int __lane1,
-+ uint64x1_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_laneqhi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
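The vcopyq_lane forms are the converse case: a 128-bit destination fed from a 64-bit source. A sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float32x4_t a = {1.0f, 2.0f, 3.0f, 4.0f};
  float32x2_t b = {8.0f, 9.0f};
  /* Lane 1 of b replaces lane 2 of a, giving {1, 2, 9, 4}.  */
  float32x4_t r = vcopyq_lane_f32 (a, 2, b, 1);
  printf ("%g\n", (double) vgetq_lane_f32 (r, 2));   /* prints 9 */
  return 0;
}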
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmlahs_s32 (int32_t __a, int32_t __b, int32_t __c)
-+/* vcopyq_laneq. */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_f32 (float32x4_t __a, const int __lane1,
-+ float32x4_t __b, const int __lane2)
- {
-- return (int32_t) __builtin_aarch64_sqrdmlahsi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmlahs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_f64 (float64x2_t __a, const int __lane1,
-+ float64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_lanesi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmlahs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_p8 (poly8x16_t __a, const int __lane1,
-+ poly8x16_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlah_laneqsi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1,
-+ poly16x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_lanev4hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_p64 (poly64x2_t __a, const int __lane1,
-+ poly64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_lanev2si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s8 (int8x16_t __a, const int __lane1,
-+ int8x16_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_lanev8hi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s16 (int16x8_t __a, const int __lane1,
-+ int16x8_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_lanev4si (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmlshh_s16 (int16_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s32 (int32x4_t __a, const int __lane1,
-+ int32x4_t __b, const int __lane2)
- {
-- return (int16_t) __builtin_aarch64_sqrdmlshhi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmlshh_lane_s16 (int16_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_s64 (int64x2_t __a, const int __lane1,
-+ int64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_lanehi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmlshh_laneq_s16 (int16_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u8 (uint8x16_t __a, const int __lane1,
-+ uint8x16_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_laneqhi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmlshs_s32 (int32_t __a, int32_t __b, int32_t __c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u16 (uint16x8_t __a, const int __lane1,
-+ uint16x8_t __b, const int __lane2)
- {
-- return (int32_t) __builtin_aarch64_sqrdmlshsi (__a, __b, __c);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmlshs_lane_s32 (int32_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u32 (uint32x4_t __a, const int __lane1,
-+ uint32x4_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_lanesi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmlshs_laneq_s32 (int32_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcopyq_laneq_u64 (uint64x2_t __a, const int __lane1,
-+ uint64x2_t __b, const int __lane2)
- {
-- return __builtin_aarch64_sqrdmlsh_laneqsi (__a, __b, __c, __d);
-+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
-+ __a, __lane1);
- }
--#pragma GCC pop_options
-
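And vcopyq_laneq completes the matrix with both operands 128-bit. A sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float64x2_t a = {1.0, 2.0};
  float64x2_t b = {3.0, 4.0};
  /* Lane 1 of b into lane 0 of a: {4.0, 2.0}.  */
  float64x2_t r = vcopyq_laneq_f64 (a, 0, b, 1);
  printf ("%g %g\n", vgetq_lane_f64 (r, 0), vgetq_lane_f64 (r, 1));
  return 0;
}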
--#pragma GCC push_options
--#pragma GCC target ("+nothing+crypto")
--/* vaes */
-+/* vcvt (double -> float). */
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaeseq_u8 (uint8x16_t data, uint8x16_t key)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f16_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_crypto_aesev16qi_uuu (data, key);
-+ return __builtin_aarch64_float_truncate_lo_v4hf (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaesdq_u8 (uint8x16_t data, uint8x16_t key)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
- {
-- return __builtin_aarch64_crypto_aesdv16qi_uuu (data, key);
-+ return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaesmcq_u8 (uint8x16_t data)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_crypto_aesmcv16qi_uu (data);
-+ return __builtin_aarch64_float_truncate_lo_v2sf (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaesimcq_u8 (uint8x16_t data)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
- {
-- return __builtin_aarch64_crypto_aesimcv16qi_uu (data);
-+ return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
- }
--#pragma GCC pop_options
-
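A sketch of the narrowing conversions above, not part of the patch; it assumes a toolchain where the half-precision storage format is available, as AArch64 compilers enable by default.

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  /* Narrow four floats to half precision, then fill the upper half
     of a 128-bit half-precision vector from a second float vector.  */
  float16x4_t lo = vcvt_f16_f32 (vdupq_n_f32 (1.5f));
  float16x8_t h8 = vcvt_high_f16_f32 (lo, vdupq_n_f32 (2.5f));
  /* Widen the upper half back to check the round trip.  */
  float32x4_t back = vcvt_f32_f16 (vget_high_f16 (h8));
  printf ("%g\n", (double) vgetq_lane_f32 (back, 0));   /* prints 2.5 */
  return 0;
}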
--/* vcage */
-+/* vcvt (float -> double). */
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcage_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_f16 (float16x4_t __a)
- {
-- return vabs_f64 (__a) >= vabs_f64 (__b);
-+ return __builtin_aarch64_float_extend_lo_v4sf (__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcages_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f64_f32 (float32x2_t __a)
- {
-- return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
-+ return __builtin_aarch64_float_extend_lo_v2df (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcage_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f32_f16 (float16x8_t __a)
- {
-- return vabs_f32 (__a) >= vabs_f32 (__b);
-+ return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcageq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_high_f64_f32 (float32x4_t __a)
- {
-- return vabsq_f32 (__a) >= vabsq_f32 (__b);
-+ return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
- }
-
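The widening direction, sketched below (not part of the patch): the plain form extends the low half of the source, the _high form the upper half.

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float32x4_t q = {1.0f, 2.0f, 3.0f, 4.0f};
  float64x2_t lo = vcvt_f64_f32 (vget_low_f32 (q));   /* {1.0, 2.0} */
  float64x2_t hi = vcvt_high_f64_f32 (q);             /* {3.0, 4.0} */
  printf ("%g %g\n", vgetq_lane_f64 (lo, 0), vgetq_lane_f64 (hi, 1));
  return 0;
}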
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcaged_f64 (float64_t __a, float64_t __b)
-+/* vcvt (<u>fixed-point -> float). */
-+
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_f64_s64 (int64_t __a, const int __b)
- {
-- return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
-+ return __builtin_aarch64_scvtfdi (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcageq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_f64_u64 (uint64_t __a, const int __b)
- {
-- return vabsq_f64 (__a) >= vabsq_f64 (__b);
-+ return __builtin_aarch64_ucvtfdi_sus (__a, __b);
- }
-
--/* vcagt */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcagts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_f32_s32 (int32_t __a, const int __b)
- {
-- return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
-+ return __builtin_aarch64_scvtfsi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcagt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_f32_u32 (uint32_t __a, const int __b)
- {
-- return vabs_f32 (__a) > vabs_f32 (__b);
-+ return __builtin_aarch64_ucvtfsi_sus (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcagt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
- {
-- return vabs_f64 (__a) > vabs_f64 (__b);
-+ return __builtin_aarch64_scvtfv2si (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcagtq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
- {
-- return vabsq_f32 (__a) > vabsq_f32 (__b);
-+ return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcagtd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f64_s64 (int64x1_t __a, const int __b)
- {
-- return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0;
-+ return (float64x1_t)
-+ { __builtin_aarch64_scvtfdi (vget_lane_s64 (__a, 0), __b) };
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcagtq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f64_u64 (uint64x1_t __a, const int __b)
- {
-- return vabsq_f64 (__a) > vabsq_f64 (__b);
-+ return (float64x1_t)
-+ { __builtin_aarch64_ucvtfdi_sus (vget_lane_u64 (__a, 0), __b) };
- }
-
--/* vcale */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcale_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
- {
-- return vabs_f32 (__a) <= vabs_f32 (__b);
-+ return __builtin_aarch64_scvtfv4si (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcale_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
- {
-- return vabs_f64 (__a) <= vabs_f64 (__b);
-+ return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcaled_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0;
-+ return __builtin_aarch64_scvtfv2di (__a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcales_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
- {
-- return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? -1 : 0;
-+ return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
- }
-
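For the fixed-point-to-float conversions above, the second argument is the number of fraction bits and must be an integer constant expression (1..64 for the 64-bit forms, 1..32 for the 32-bit ones). A sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  /* Treat 100 as a fixed-point value with 4 fraction bits:
     100 / 2^4 == 6.25.  */
  double d = vcvtd_n_f64_s64 (100, 4);
  float32x2_t v = vcvt_n_f32_s32 (vdup_n_s32 (100), 4);
  printf ("%g %g\n", d, (double) vget_lane_f32 (v, 0));   /* 6.25 6.25 */
  return 0;
}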
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcaleq_f32 (float32x4_t __a, float32x4_t __b)
-+/* vcvt (float -> <u>fixed-point). */
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_s64_f64 (float64_t __a, const int __b)
- {
-- return vabsq_f32 (__a) <= vabsq_f32 (__b);
-+ return __builtin_aarch64_fcvtzsdf (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcaleq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_n_u64_f64 (float64_t __a, const int __b)
- {
-- return vabsq_f64 (__a) <= vabsq_f64 (__b);
-+ return __builtin_aarch64_fcvtzudf_uss (__a, __b);
- }
-
--/* vcalt */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcalt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_s32_f32 (float32_t __a, const int __b)
- {
-- return vabs_f32 (__a) < vabs_f32 (__b);
-+ return __builtin_aarch64_fcvtzssf (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcalt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_n_u32_f32 (float32_t __a, const int __b)
- {
-- return vabs_f64 (__a) < vabs_f64 (__b);
-+ return __builtin_aarch64_fcvtzusf_uss (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcaltd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
- {
-- return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
-+ return __builtin_aarch64_fcvtzsv2sf (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcaltq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
- {
-- return vabsq_f32 (__a) < vabsq_f32 (__b);
-+ return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcaltq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_s64_f64 (float64x1_t __a, const int __b)
- {
-- return vabsq_f64 (__a) < vabsq_f64 (__b);
-+ return (int64x1_t)
-+ { __builtin_aarch64_fcvtzsdf (vget_lane_f64 (__a, 0), __b) };
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcalts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_u64_f64 (float64x1_t __a, const int __b)
- {
-- return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
-+ return (uint64x1_t)
-+ { __builtin_aarch64_fcvtzudf_uss (vget_lane_f64 (__a, 0), __b) };
- }
-
--/* vceq - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vceq_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
- {
-- return (uint32x2_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzsv4sf (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceq_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
- {
-- return (uint64x1_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vceq_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
- {
-- return (uint8x8_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzsv2df (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vceq_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
- {
-- return (uint8x8_t) (__a == __b);
-+ return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
- }
-
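The reverse direction scales by 2^n and truncates toward zero. A sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  /* 6.25 * 2^4 == 100.  */
  int64_t q = vcvtd_n_s64_f64 (6.25, 4);
  printf ("%lld\n", (long long) q);   /* prints 100 */
  return 0;
}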
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vceq_s16 (int16x4_t __a, int16x4_t __b)
-+/* vcvt (<u>int -> float) */
-+
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_f64_s64 (int64_t __a)
- {
-- return (uint16x4_t) (__a == __b);
-+ return (float64_t) __a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vceq_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_f64_u64 (uint64_t __a)
- {
-- return (uint32x2_t) (__a == __b);
-+ return (float64_t) __a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceq_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_f32_s32 (int32_t __a)
- {
-- return (uint64x1_t) (__a == __b);
-+ return (float32_t) __a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vceq_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_f32_u32 (uint32_t __a)
- {
-- return (__a == __b);
-+ return (float32_t) __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vceq_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_s32 (int32x2_t __a)
- {
-- return (__a == __b);
-+ return __builtin_aarch64_floatv2siv2sf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vceq_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f32_u32 (uint32x2_t __a)
- {
-- return (__a == __b);
-+ return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceq_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f64_s64 (int64x1_t __a)
- {
-- return (__a == __b);
-+ return (float64x1_t) { vget_lane_s64 (__a, 0) };
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vceqq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f64_u64 (uint64x1_t __a)
- {
-- return (uint32x4_t) (__a == __b);
-+ return (float64x1_t) { vget_lane_u64 (__a, 0) };
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vceqq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f32_s32 (int32x4_t __a)
- {
-- return (uint64x2_t) (__a == __b);
-+ return __builtin_aarch64_floatv4siv4sf (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f32_u32 (uint32x4_t __a)
- {
-- return (uint8x16_t) (__a == __b);
-+ return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vceqq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f64_s64 (int64x2_t __a)
- {
-- return (uint8x16_t) (__a == __b);
-+ return __builtin_aarch64_floatv2div2df (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vceqq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f64_u64 (uint64x2_t __a)
- {
-- return (uint16x8_t) (__a == __b);
-+ return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
- }
-
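The integer-to-float conversions are exact where representable; a sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  int32x2_t i = {3, -4};
  float32x2_t f = vcvt_f32_s32 (i);   /* {3.0f, -4.0f} */
  printf ("%g %g\n", (double) vget_lane_f32 (f, 0),
          (double) vget_lane_f32 (f, 1));
  return 0;
}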
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vceqq_s32 (int32x4_t __a, int32x4_t __b)
-+/* vcvt (float -> <u>int) */
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_s64_f64 (float64_t __a)
- {
-- return (uint32x4_t) (__a == __b);
-+ return (int64_t) __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vceqq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtd_u64_f64 (float64_t __a)
- {
-- return (uint64x2_t) (__a == __b);
-+ return (uint64_t) __a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_s32_f32 (float32_t __a)
- {
-- return (__a == __b);
-+ return (int32_t) __a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvts_u32_f32 (float32_t __a)
- {
-- return (__a == __b);
-+ return (uint32_t) __a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_s32_f32 (float32x2_t __a)
- {
-- return (__a == __b);
-+ return __builtin_aarch64_lbtruncv2sfv2si (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_u32_f32 (float32x2_t __a)
- {
-- return (__a == __b);
-+ return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
- }
-
--/* vceq - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vceqs_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_s32_f32 (float32x4_t __a)
- {
-- return __a == __b ? -1 : 0;
-+ return __builtin_aarch64_lbtruncv4sfv4si (__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vceqd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_u32_f32 (float32x4_t __a)
- {
-- return __a == __b ? -1ll : 0ll;
-+ return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vceqd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_s64_f64 (float64x1_t __a)
- {
-- return __a == __b ? -1ll : 0ll;
-+ return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vceqd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_u64_f64 (float64x1_t __a)
- {
-- return __a == __b ? -1ll : 0ll;
-+ return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
- }
-
--/* vceqz - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vceqz_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_s64_f64 (float64x2_t __a)
- {
-- return (uint32x2_t) (__a == 0.0f);
-+ return __builtin_aarch64_lbtruncv2dfv2di (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceqz_f64 (float64x1_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_u64_f64 (float64x2_t __a)
- {
-- return (uint64x1_t) (__a == (float64x1_t) {0.0});
-+ return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
- }
-
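As the lbtrunc builtins above indicate, the plain vcvt float-to-integer forms truncate toward zero. A sketch, not part of the patch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float32x2_t f = {2.9f, -2.9f};
  int32x2_t i = vcvt_s32_f32 (f);   /* truncates: {2, -2} */
  printf ("%d %d\n", vget_lane_s32 (i, 0), vget_lane_s32 (i, 1));
  return 0;
}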
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vceqz_p8 (poly8x8_t __a)
-+/* vcvta */
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtad_s64_f64 (float64_t __a)
- {
-- return (uint8x8_t) (__a == 0);
-+ return __builtin_aarch64_lrounddfdi (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vceqz_s8 (int8x8_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtad_u64_f64 (float64_t __a)
- {
-- return (uint8x8_t) (__a == 0);
-+ return __builtin_aarch64_lroundudfdi_us (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vceqz_s16 (int16x4_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtas_s32_f32 (float32_t __a)
- {
-- return (uint16x4_t) (__a == 0);
-+ return __builtin_aarch64_lroundsfsi (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vceqz_s32 (int32x2_t __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtas_u32_f32 (float32_t __a)
- {
-- return (uint32x2_t) (__a == 0);
-+ return __builtin_aarch64_lroundusfsi_us (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceqz_s64 (int64x1_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_s32_f32 (float32x2_t __a)
- {
-- return (uint64x1_t) (__a == __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_lroundv2sfv2si (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vceqz_u8 (uint8x8_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_u32_f32 (float32x2_t __a)
- {
-- return (__a == 0);
-+ return __builtin_aarch64_lrounduv2sfv2si_us (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vceqz_u16 (uint16x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_s32_f32 (float32x4_t __a)
- {
-- return (__a == 0);
-+ return __builtin_aarch64_lroundv4sfv4si (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vceqz_u32 (uint32x2_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_u32_f32 (float32x4_t __a)
- {
-- return (__a == 0);
-+ return __builtin_aarch64_lrounduv4sfv4si_us (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceqz_u64 (uint64x1_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_s64_f64 (float64x1_t __a)
- {
-- return (__a == __AARCH64_UINT64_C (0));
-+ return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vceqzq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_u64_f64 (float64x1_t __a)
- {
-- return (uint32x4_t) (__a == 0.0f);
-+ return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vceqzq_f64 (float64x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_s64_f64 (float64x2_t __a)
- {
-- return (uint64x2_t) (__a == 0.0f);
-+ return __builtin_aarch64_lroundv2dfv2di (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vceqzq_p8 (poly8x16_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_u64_f64 (float64x2_t __a)
- {
-- return (uint8x16_t) (__a == 0);
-+ return __builtin_aarch64_lrounduv2dfv2di_us (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vceqzq_s8 (int8x16_t __a)
-+/* vcvtm */
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmd_s64_f64 (float64_t __a)
- {
-- return (uint8x16_t) (__a == 0);
-+ return __builtin_llfloor (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vceqzq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmd_u64_f64 (float64_t __a)
- {
-- return (uint16x8_t) (__a == 0);
-+ return __builtin_aarch64_lfloorudfdi_us (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vceqzq_s32 (int32x4_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtms_s32_f32 (float32_t __a)
- {
-- return (uint32x4_t) (__a == 0);
-+ return __builtin_ifloorf (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vceqzq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtms_u32_f32 (float32_t __a)
- {
-- return (uint64x2_t) (__a == __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_lfloorusfsi_us (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vceqzq_u8 (uint8x16_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_s32_f32 (float32x2_t __a)
- {
-- return (__a == 0);
-+ return __builtin_aarch64_lfloorv2sfv2si (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vceqzq_u16 (uint16x8_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_u32_f32 (float32x2_t __a)
- {
-- return (__a == 0);
-+ return __builtin_aarch64_lflooruv2sfv2si_us (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vceqzq_u32 (uint32x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_s32_f32 (float32x4_t __a)
- {
-- return (__a == 0);
-+ return __builtin_aarch64_lfloorv4sfv4si (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vceqzq_u64 (uint64x2_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_u32_f32 (float32x4_t __a)
- {
-- return (__a == __AARCH64_UINT64_C (0));
-+ return __builtin_aarch64_lflooruv4sfv4si_us (__a);
- }
-
--/* vceqz - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vceqzs_f32 (float32_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_s64_f64 (float64x1_t __a)
- {
-- return __a == 0.0f ? -1 : 0;
-+ return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vceqzd_s64 (int64_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_u64_f64 (float64x1_t __a)
- {
-- return __a == 0 ? -1ll : 0ll;
-+ return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vceqzd_u64 (uint64_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_s64_f64 (float64x2_t __a)
- {
-- return __a == 0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lfloorv2dfv2di (__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vceqzd_f64 (float64_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_u64_f64 (float64x2_t __a)
- {
-- return __a == 0.0 ? -1ll : 0ll;
-+ return __builtin_aarch64_lflooruv2dfv2di_us (__a);
- }
-
--/* vcge - vector. */
-+/* vcvtn */
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcge_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnd_s64_f64 (float64_t __a)
- {
-- return (uint32x2_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintndfdi (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcge_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnd_u64_f64 (float64_t __a)
- {
-- return (uint64x1_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnudfdi_us (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcge_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtns_s32_f32 (float32_t __a)
- {
-- return (uint8x8_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnsfsi (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcge_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtns_u32_f32 (float32_t __a)
- {
-- return (uint16x4_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnusfsi_us (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcge_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_s32_f32 (float32x2_t __a)
- {
-- return (uint32x2_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnv2sfv2si (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcge_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_u32_f32 (float32x2_t __a)
- {
-- return (uint64x1_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcge_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_s32_f32 (float32x4_t __a)
- {
-- return (__a >= __b);
-+ return __builtin_aarch64_lfrintnv4sfv4si (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcge_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_u32_f32 (float32x4_t __a)
- {
-- return (__a >= __b);
-+ return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcge_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_s64_f64 (float64x1_t __a)
- {
-- return (__a >= __b);
-+ return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcge_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_u64_f64 (float64x1_t __a)
- {
-- return (__a >= __b);
-+ return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgeq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_s64_f64 (float64x2_t __a)
- {
-- return (uint32x4_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnv2dfv2di (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgeq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_u64_f64 (float64x2_t __a)
- {
-- return (uint64x2_t) (__a >= __b);
-+ return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcgeq_s8 (int8x16_t __a, int8x16_t __b)
-+/* vcvtp */
-+
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpd_s64_f64 (float64_t __a)
- {
-- return (uint8x16_t) (__a >= __b);
-+ return __builtin_llceil (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcgeq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpd_u64_f64 (float64_t __a)
- {
-- return (uint16x8_t) (__a >= __b);
-+ return __builtin_aarch64_lceiludfdi_us (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgeq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtps_s32_f32 (float32_t __a)
- {
-- return (uint32x4_t) (__a >= __b);
-+ return __builtin_iceilf (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgeq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtps_u32_f32 (float32_t __a)
- {
-- return (uint64x2_t) (__a >= __b);
-+ return __builtin_aarch64_lceilusfsi_us (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_s32_f32 (float32x2_t __a)
- {
-- return (__a >= __b);
-+ return __builtin_aarch64_lceilv2sfv2si (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_u32_f32 (float32x2_t __a)
- {
-- return (__a >= __b);
-+ return __builtin_aarch64_lceiluv2sfv2si_us (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_s32_f32 (float32x4_t __a)
- {
-- return (__a >= __b);
-+ return __builtin_aarch64_lceilv4sfv4si (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_u32_f32 (float32x4_t __a)
- {
-- return (__a >= __b);
-+ return __builtin_aarch64_lceiluv4sfv4si_us (__a);
- }
-
--/* vcge - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcges_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_s64_f64 (float64x1_t __a)
- {
-- return __a >= __b ? -1 : 0;
-+ return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcged_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_u64_f64 (float64x1_t __a)
- {
-- return __a >= __b ? -1ll : 0ll;
-+ return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcged_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_s64_f64 (float64x2_t __a)
- {
-- return __a >= __b ? -1ll : 0ll;
-+ return __builtin_aarch64_lceilv2dfv2di (__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcged_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_u64_f64 (float64x2_t __a)
- {
-- return __a >= __b ? -1ll : 0ll;
-+ return __builtin_aarch64_lceiluv2dfv2di_us (__a);
- }
-
--/* vcgez - vector. */
-+/* vdup_n */
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgez_f32 (float32x2_t __a)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f16 (float16_t __a)
- {
-- return (uint32x2_t) (__a >= 0.0f);
-+ return (float16x4_t) {__a, __a, __a, __a};
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgez_f64 (float64x1_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f32 (float32_t __a)
- {
-- return (uint64x1_t) (__a[0] >= (float64x1_t) {0.0});
-+ return (float32x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcgez_s8 (int8x8_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f64 (float64_t __a)
- {
-- return (uint8x8_t) (__a >= 0);
-+ return (float64x1_t) {__a};
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcgez_s16 (int16x4_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_p8 (poly8_t __a)
- {
-- return (uint16x4_t) (__a >= 0);
-+ return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgez_s32 (int32x2_t __a)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_p16 (poly16_t __a)
- {
-- return (uint32x2_t) (__a >= 0);
-+ return (poly16x4_t) {__a, __a, __a, __a};
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgez_s64 (int64x1_t __a)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_p64 (poly64_t __a)
- {
-- return (uint64x1_t) (__a >= __AARCH64_INT64_C (0));
-+ return (poly64x1_t) {__a};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgezq_f32 (float32x4_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s8 (int8_t __a)
- {
-- return (uint32x4_t) (__a >= 0.0f);
-+ return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgezq_f64 (float64x2_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s16 (int16_t __a)
- {
-- return (uint64x2_t) (__a >= 0.0);
-+ return (int16x4_t) {__a, __a, __a, __a};
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcgezq_s8 (int8x16_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s32 (int32_t __a)
- {
-- return (uint8x16_t) (__a >= 0);
-+ return (int32x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcgezq_s16 (int16x8_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_s64 (int64_t __a)
- {
-- return (uint16x8_t) (__a >= 0);
-+ return (int64x1_t) {__a};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgezq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u8 (uint8_t __a)
- {
-- return (uint32x4_t) (__a >= 0);
-+ return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgezq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u16 (uint16_t __a)
- {
-- return (uint64x2_t) (__a >= __AARCH64_INT64_C (0));
-+ return (uint16x4_t) {__a, __a, __a, __a};
- }
-
--/* vcgez - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcgezs_f32 (float32_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u32 (uint32_t __a)
- {
-- return __a >= 0.0f ? -1 : 0;
-+ return (uint32x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgezd_s64 (int64_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_u64 (uint64_t __a)
- {
-- return __a >= 0 ? -1ll : 0ll;
-+ return (uint64x1_t) {__a};
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgezd_f64 (float64_t __a)
-+/* vdupq_n */
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f16 (float16_t __a)
- {
-- return __a >= 0.0 ? -1ll : 0ll;
-+ return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--/* vcgt - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgt_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f32 (float32_t __a)
- {
-- return (uint32x2_t) (__a > __b);
-+ return (float32x4_t) {__a, __a, __a, __a};
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f64 (float64_t __a)
- {
-- return (uint64x1_t) (__a > __b);
-+ return (float64x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcgt_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_p8 (uint32_t __a)
- {
-- return (uint8x8_t) (__a > __b);
-+ return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-+ __a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcgt_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_p16 (uint32_t __a)
- {
-- return (uint16x4_t) (__a > __b);
-+ return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgt_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_p64 (uint64_t __a)
- {
-- return (uint32x2_t) (__a > __b);
-+ return (poly64x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgt_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s8 (int32_t __a)
- {
-- return (uint64x1_t) (__a > __b);
-+ return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-+ __a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s16 (int32_t __a)
- {
-- return (__a > __b);
-+ return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s32 (int32_t __a)
- {
-- return (__a > __b);
-+ return (int32x4_t) {__a, __a, __a, __a};
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_s64 (int64_t __a)
- {
-- return (__a > __b);
-+ return (int64x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u8 (uint32_t __a)
- {
-- return (__a > __b);
-+ return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-+ __a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgtq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u16 (uint32_t __a)
- {
-- return (uint32x4_t) (__a > __b);
-+ return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgtq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u32 (uint32_t __a)
- {
-- return (uint64x2_t) (__a > __b);
-+ return (uint32x4_t) {__a, __a, __a, __a};
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcgtq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_u64 (uint64_t __a)
- {
-- return (uint8x16_t) (__a > __b);
-+ return (uint64x2_t) {__a, __a};
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcgtq_s16 (int16x8_t __a, int16x8_t __b)
-+/* vdup_lane */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f16 (float16x4_t __a, const int __b)
- {
-- return (uint16x8_t) (__a > __b);
-+ return __aarch64_vdup_lane_f16 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgtq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f32 (float32x2_t __a, const int __b)
- {
-- return (uint32x4_t) (__a > __b);
-+ return __aarch64_vdup_lane_f32 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgtq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f64 (float64x1_t __a, const int __b)
- {
-- return (uint64x2_t) (__a > __b);
-+ return __aarch64_vdup_lane_f64 (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_p8 (poly8x8_t __a, const int __b)
- {
-- return (__a > __b);
-+ return __aarch64_vdup_lane_p8 (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_p16 (poly16x4_t __a, const int __b)
- {
-- return (__a > __b);
-+ return __aarch64_vdup_lane_p16 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_p64 (poly64x1_t __a, const int __b)
- {
-- return (__a > __b);
-+ return __aarch64_vdup_lane_p64 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s8 (int8x8_t __a, const int __b)
- {
-- return (__a > __b);
-+ return __aarch64_vdup_lane_s8 (__a, __b);
- }
-
--/* vcgt - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcgts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s16 (int16x4_t __a, const int __b)
- {
-- return __a > __b ? -1 : 0;
-+ return __aarch64_vdup_lane_s16 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgtd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s32 (int32x2_t __a, const int __b)
- {
-- return __a > __b ? -1ll : 0ll;
-+ return __aarch64_vdup_lane_s32 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgtd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_s64 (int64x1_t __a, const int __b)
- {
-- return __a > __b ? -1ll : 0ll;
-+ return __aarch64_vdup_lane_s64 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgtd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u8 (uint8x8_t __a, const int __b)
- {
-- return __a > __b ? -1ll : 0ll;
-+ return __aarch64_vdup_lane_u8 (__a, __b);
- }
-
--/* vcgtz - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgtz_f32 (float32x2_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u16 (uint16x4_t __a, const int __b)
- {
-- return (uint32x2_t) (__a > 0.0f);
-+ return __aarch64_vdup_lane_u16 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgtz_f64 (float64x1_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u32 (uint32x2_t __a, const int __b)
- {
-- return (uint64x1_t) (__a > (float64x1_t) {0.0});
-+ return __aarch64_vdup_lane_u32 (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcgtz_s8 (int8x8_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_u64 (uint64x1_t __a, const int __b)
- {
-- return (uint8x8_t) (__a > 0);
-+ return __aarch64_vdup_lane_u64 (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcgtz_s16 (int16x4_t __a)
-+/* vdup_laneq */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_f16 (float16x8_t __a, const int __b)
- {
-- return (uint16x4_t) (__a > 0);
-+ return __aarch64_vdup_laneq_f16 (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcgtz_s32 (int32x2_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_f32 (float32x4_t __a, const int __b)
- {
-- return (uint32x2_t) (__a > 0);
-+ return __aarch64_vdup_laneq_f32 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcgtz_s64 (int64x1_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_f64 (float64x2_t __a, const int __b)
- {
-- return (uint64x1_t) (__a > __AARCH64_INT64_C (0));
-+ return __aarch64_vdup_laneq_f64 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgtzq_f32 (float32x4_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_p8 (poly8x16_t __a, const int __b)
- {
-- return (uint32x4_t) (__a > 0.0f);
-+ return __aarch64_vdup_laneq_p8 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgtzq_f64 (float64x2_t __a)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_p16 (poly16x8_t __a, const int __b)
- {
-- return (uint64x2_t) (__a > 0.0);
-+ return __aarch64_vdup_laneq_p16 (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcgtzq_s8 (int8x16_t __a)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_p64 (poly64x2_t __a, const int __b)
- {
-- return (uint8x16_t) (__a > 0);
-+ return __aarch64_vdup_laneq_p64 (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcgtzq_s16 (int16x8_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s8 (int8x16_t __a, const int __b)
- {
-- return (uint16x8_t) (__a > 0);
-+ return __aarch64_vdup_laneq_s8 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcgtzq_s32 (int32x4_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s16 (int16x8_t __a, const int __b)
- {
-- return (uint32x4_t) (__a > 0);
-+ return __aarch64_vdup_laneq_s16 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcgtzq_s64 (int64x2_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s32 (int32x4_t __a, const int __b)
- {
-- return (uint64x2_t) (__a > __AARCH64_INT64_C (0));
-+ return __aarch64_vdup_laneq_s32 (__a, __b);
- }
-
--/* vcgtz - scalar. */
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_s64 (int64x2_t __a, const int __b)
-+{
-+ return __aarch64_vdup_laneq_s64 (__a, __b);
-+}
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcgtzs_f32 (float32_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u8 (uint8x16_t __a, const int __b)
- {
-- return __a > 0.0f ? -1 : 0;
-+ return __aarch64_vdup_laneq_u8 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgtzd_s64 (int64_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u16 (uint16x8_t __a, const int __b)
- {
-- return __a > 0 ? -1ll : 0ll;
-+ return __aarch64_vdup_laneq_u16 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcgtzd_f64 (float64_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u32 (uint32x4_t __a, const int __b)
- {
-- return __a > 0.0 ? -1ll : 0ll;
-+ return __aarch64_vdup_laneq_u32 (__a, __b);
- }
-
--/* vcle - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcle_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_laneq_u64 (uint64x2_t __a, const int __b)
- {
-- return (uint32x2_t) (__a <= __b);
-+ return __aarch64_vdup_laneq_u64 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcle_f64 (float64x1_t __a, float64x1_t __b)
-+/* vdupq_lane */
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f16 (float16x4_t __a, const int __b)
- {
-- return (uint64x1_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_f16 (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcle_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f32 (float32x2_t __a, const int __b)
- {
-- return (uint8x8_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_f32 (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcle_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f64 (float64x1_t __a, const int __b)
- {
-- return (uint16x4_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_f64 (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcle_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_p8 (poly8x8_t __a, const int __b)
- {
-- return (uint32x2_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_p8 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcle_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_p16 (poly16x4_t __a, const int __b)
- {
-- return (uint64x1_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_p16 (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcle_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_p64 (poly64x1_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_lane_p64 (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcle_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s8 (int8x8_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_lane_s8 (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcle_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s16 (int16x4_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_lane_s16 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcle_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s32 (int32x2_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_lane_s32 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcleq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_s64 (int64x1_t __a, const int __b)
- {
-- return (uint32x4_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_s64 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcleq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u8 (uint8x8_t __a, const int __b)
- {
-- return (uint64x2_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_u8 (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcleq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u16 (uint16x4_t __a, const int __b)
- {
-- return (uint8x16_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_u16 (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcleq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u32 (uint32x2_t __a, const int __b)
- {
-- return (uint16x8_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_u32 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcleq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_u64 (uint64x1_t __a, const int __b)
- {
-- return (uint32x4_t) (__a <= __b);
-+ return __aarch64_vdupq_lane_u64 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcleq_s64 (int64x2_t __a, int64x2_t __b)
-+/* vdupq_laneq */
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_f16 (float16x8_t __a, const int __b)
- {
-- return (uint64x2_t) (__a <= __b);
-+ return __aarch64_vdupq_laneq_f16 (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_f32 (float32x4_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_laneq_f32 (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_f64 (float64x2_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_laneq_f64 (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_p8 (poly8x16_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_laneq_p8 (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_p16 (poly16x8_t __a, const int __b)
- {
-- return (__a <= __b);
-+ return __aarch64_vdupq_laneq_p16 (__a, __b);
- }
-
--/* vcle - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcles_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_p64 (poly64x2_t __a, const int __b)
- {
-- return __a <= __b ? -1 : 0;
-+ return __aarch64_vdupq_laneq_p64 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcled_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s8 (int8x16_t __a, const int __b)
- {
-- return __a <= __b ? -1ll : 0ll;
-+ return __aarch64_vdupq_laneq_s8 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcled_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s16 (int16x8_t __a, const int __b)
- {
-- return __a <= __b ? -1ll : 0ll;
-+ return __aarch64_vdupq_laneq_s16 (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcled_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s32 (int32x4_t __a, const int __b)
- {
-- return __a <= __b ? -1ll : 0ll;
-+ return __aarch64_vdupq_laneq_s32 (__a, __b);
- }
-
--/* vclez - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclez_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_s64 (int64x2_t __a, const int __b)
- {
-- return (uint32x2_t) (__a <= 0.0f);
-+ return __aarch64_vdupq_laneq_s64 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vclez_f64 (float64x1_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u8 (uint8x16_t __a, const int __b)
- {
-- return (uint64x1_t) (__a <= (float64x1_t) {0.0});
-+ return __aarch64_vdupq_laneq_u8 (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vclez_s8 (int8x8_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u16 (uint16x8_t __a, const int __b)
- {
-- return (uint8x8_t) (__a <= 0);
-+ return __aarch64_vdupq_laneq_u16 (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vclez_s16 (int16x4_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u32 (uint32x4_t __a, const int __b)
- {
-- return (uint16x4_t) (__a <= 0);
-+ return __aarch64_vdupq_laneq_u32 (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclez_s32 (int32x2_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_laneq_u64 (uint64x2_t __a, const int __b)
- {
-- return (uint32x2_t) (__a <= 0);
-+ return __aarch64_vdupq_laneq_u64 (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vclez_s64 (int64x1_t __a)
-+/* vdupb_lane */
-+__extension__ extern __inline poly8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_lane_p8 (poly8x8_t __a, const int __b)
- {
-- return (uint64x1_t) (__a <= __AARCH64_INT64_C (0));
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vclezq_f32 (float32x4_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_lane_s8 (int8x8_t __a, const int __b)
- {
-- return (uint32x4_t) (__a <= 0.0f);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vclezq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_lane_u8 (uint8x8_t __a, const int __b)
- {
-- return (uint64x2_t) (__a <= 0.0);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vclezq_s8 (int8x16_t __a)
-+/* vduph_lane */
-+
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_f16 (float16x4_t __a, const int __b)
- {
-- return (uint8x16_t) (__a <= 0);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vclezq_s16 (int16x8_t __a)
-+__extension__ extern __inline poly16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_p16 (poly16x4_t __a, const int __b)
- {
-- return (uint16x8_t) (__a <= 0);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vclezq_s32 (int32x4_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_s16 (int16x4_t __a, const int __b)
- {
-- return (uint32x4_t) (__a <= 0);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vclezq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_lane_u16 (uint16x4_t __a, const int __b)
- {
-- return (uint64x2_t) (__a <= __AARCH64_INT64_C (0));
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--/* vclez - scalar. */
-+/* vdups_lane */
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vclezs_f32 (float32_t __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_lane_f32 (float32x2_t __a, const int __b)
- {
-- return __a <= 0.0f ? -1 : 0;
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vclezd_s64 (int64_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_lane_s32 (int32x2_t __a, const int __b)
- {
-- return __a <= 0 ? -1ll : 0ll;
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vclezd_f64 (float64_t __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_lane_u32 (uint32x2_t __a, const int __b)
- {
-- return __a <= 0.0 ? -1ll : 0ll;
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--/* vclt - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclt_f32 (float32x2_t __a, float32x2_t __b)
-+/* vdupd_lane */
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_lane_f64 (float64x1_t __a, const int __b)
- {
-- return (uint32x2_t) (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __b);
-+ return __a[0];
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vclt_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_lane_s64 (int64x1_t __a, const int __b)
- {
-- return (uint64x1_t) (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __b);
-+ return __a[0];
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vclt_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_lane_u64 (uint64x1_t __a, const int __b)
- {
-- return (uint8x8_t) (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __b);
-+ return __a[0];
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vclt_s16 (int16x4_t __a, int16x4_t __b)
-+/* vdupb_laneq */
-+__extension__ extern __inline poly8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_laneq_p8 (poly8x16_t __a, const int __b)
- {
-- return (uint16x4_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclt_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_laneq_s8 (int8x16_t __a, const int __b)
- {
-- return (uint32x2_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vclt_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupb_laneq_u8 (uint8x16_t __a, const int __b)
- {
-- return (uint64x1_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vclt_u8 (uint8x8_t __a, uint8x8_t __b)
--{
-- return (__a < __b);
--}
-+/* vduph_laneq */
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vclt_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_f16 (float16x8_t __a, const int __b)
- {
-- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclt_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline poly16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_p16 (poly16x8_t __a, const int __b)
- {
-- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vclt_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_s16 (int16x8_t __a, const int __b)
- {
-- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vduph_laneq_u16 (uint16x8_t __a, const int __b)
- {
-- return (uint32x4_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltq_f64 (float64x2_t __a, float64x2_t __b)
-+/* vdups_laneq */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_laneq_f32 (float32x4_t __a, const int __b)
- {
-- return (uint64x2_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcltq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_laneq_s32 (int32x4_t __a, const int __b)
- {
-- return (uint8x16_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcltq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdups_laneq_u32 (uint32x4_t __a, const int __b)
- {
-- return (uint16x8_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltq_s32 (int32x4_t __a, int32x4_t __b)
-+/* vdupd_laneq */
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_laneq_f64 (float64x2_t __a, const int __b)
- {
-- return (uint32x4_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_laneq_s64 (int64x2_t __a, const int __b)
- {
-- return (uint64x2_t) (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupd_laneq_u64 (uint64x2_t __a, const int __b)
- {
-- return (__a < __b);
-+ return __aarch64_vget_lane_any (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
-+/* vext */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c)
- {
-- return (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3});
-+#endif
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
- {
-- return (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
- {
-- return (__a < __b);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+ /* The only possible index to the assembler instruction returns element 0. */
-+ return __a;
- }
--
--/* vclt - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vclts_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
- {
-- return __a < __b ? -1 : 0;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
- {
-- return __a < __b ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_p64 (poly64x1_t __a, poly64x1_t __b, __const int __c)
- {
-- return __a < __b ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+ /* The only possible index to the assembler instruction returns element 0. */
-+ return __a;
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
- {
-- return __a < __b ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--/* vcltz - vector. */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcltz_f32 (float32x2_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
- {
-- return (uint32x2_t) (__a < 0.0f);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcltz_f64 (float64x1_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
- {
-- return (uint64x1_t) (__a < (float64x1_t) {0.0});
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcltz_s8 (int8x8_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
- {
-- return (uint8x8_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+ /* The only possible index to the assembler instruction returns element 0. */
-+ return __a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vcltz_s16 (int16x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
- {
-- return (uint16x4_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcltz_s32 (int32x2_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
- {
-- return (uint32x2_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcltz_s64 (int64x1_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c)
- {
-- return (uint64x1_t) (__a < __AARCH64_INT64_C (0));
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltzq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
- {
-- return (uint32x4_t) (__a < 0.0f);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+ /* The only possible index to the assembler instruction returns element 0. */
-+ return __a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltzq_f64 (float64x2_t __a)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_f16 (float16x8_t __a, float16x8_t __b, __const int __c)
- {
-- return (uint64x2_t) (__a < 0.0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint16x8_t) {8 - __c, 9 - __c, 10 - __c, 11 - __c,
-+ 12 - __c, 13 - __c, 14 - __c,
-+ 15 - __c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {__c, __c + 1, __c + 2, __c + 3,
-+ __c + 4, __c + 5, __c + 6, __c + 7});
-+#endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcltzq_s8 (int8x16_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
- {
-- return (uint8x16_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vcltzq_s16 (int16x8_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c)
- {
-- return (uint16x8_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcltzq_s32 (int32x4_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c)
- {
-- return (uint32x4_t) (__a < 0);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x16_t)
-+ {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-+ 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
-+#endif
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcltzq_s64 (int64x2_t __a)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c)
- {
-- return (uint64x2_t) (__a < __AARCH64_INT64_C (0));
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint16x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--/* vcltz - scalar. */
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcltzs_f32 (float32_t __a)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_p64 (poly64x2_t __a, poly64x2_t __b, __const int __c)
- {
-- return __a < 0.0f ? -1 : 0;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltzd_s64 (int64_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c)
- {
-- return __a < 0 ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x16_t)
-+ {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-+ 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcltzd_f64 (float64_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c)
- {
-- return __a < 0.0 ? -1ll : 0ll;
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint16x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--/* vcls. */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vcls_s8 (int8x8_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv8qi (__a);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vcls_s16 (int16x4_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv4hi (__a);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
-+#endif
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcls_s32 (int32x2_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv2si (__a);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint8x16_t)
-+ {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-+ 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
-+#endif
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vclsq_s8 (int8x16_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv16qi (__a);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint16x8_t)
-+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
-+#endif
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vclsq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv8hi (__a);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a,
-+ (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
-+#endif
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vclsq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c)
- {
-- return __builtin_aarch64_clrsbv4si (__a);
-+ __AARCH64_LANE_CHECK (__a, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
-+#endif
- }
-
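vext treats {__a, __b} as one double-length vector and returns the consecutive lanes starting at index __c; the __AARCH64EB__ branches only renumber the __builtin_shuffle indices for big-endian lane order. An illustrative sketch, not part of the patch (assumes an AArch64 target):

#include <arm_neon.h>

uint8x8_t
rotate_window (uint8x8_t a, uint8x8_t b)
{
  /* Result is a[3..7] followed by b[0..2].  */
  return vext_u8 (a, b, 3);
}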
--/* vclz. */
-+/* vfma */
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vclz_s8 (int8x8_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
- {
-- return __builtin_aarch64_clzv8qi (__a);
-+ return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vclz_s16 (int16x4_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
-- return __builtin_aarch64_clzv4hi (__a);
-+ return __builtin_aarch64_fmav2sf (__b, __c, __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vclz_s32 (int32x2_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
-- return __builtin_aarch64_clzv2si (__a);
-+ return __builtin_aarch64_fmav4sf (__b, __c, __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vclz_u8 (uint8x8_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
- {
-- return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
-+ return __builtin_aarch64_fmav2df (__b, __c, __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vclz_u16 (uint16x4_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
- {
-- return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
-+ return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vclz_u32 (uint32x2_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
- {
-- return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
-+ return (float64x1_t) {__b[0] * __c + __a[0]};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vclzq_s8 (int8x16_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
- {
-- return __builtin_aarch64_clzv16qi (__a);
-+ return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vclzq_s16 (int16x8_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
- {
-- return __builtin_aarch64_clzv8hi (__a);
-+ return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
- }
-
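Note the operand order in the bodies above: the accumulator is the first argument of the intrinsic but the last argument of the fma builtin, so vfma_f32 (__a, __b, __c) computes __b * __c + __a with one fused rounding. A hedged usage sketch (function name is illustrative):

#include <arm_neon.h>

float32x2_t
mul_acc (float32x2_t acc, float32x2_t x, float32x2_t y)
{
  /* x * y + acc, fused into a single rounding step.  */
  return vfma_f32 (acc, x, y);
}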
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vclzq_s32 (int32x4_t __a)
-+/* vfma_lane */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_clzv4si (__a);
-+ return __builtin_aarch64_fmav2sf (__b,
-+ __aarch64_vdup_lane_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vclzq_u8 (uint8x16_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
-+ float64x1_t __c, const int __lane)
- {
-- return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
-+ return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vclzq_u16 (uint16x8_t __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmad_lane_f64 (float64_t __a, float64_t __b,
-+ float64x1_t __c, const int __lane)
- {
-- return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
-+ return __builtin_fma (__b, __c[0], __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vclzq_u32 (uint32x4_t __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmas_lane_f32 (float32_t __a, float32_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
-+ return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
- }
-
--/* vcnt. */
-+/* vfma_laneq */
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vcnt_p8 (poly8x8_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- return (poly8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
-+ return __builtin_aarch64_fmav2sf (__b,
-+ __aarch64_vdup_laneq_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vcnt_s8 (int8x8_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
-+ float64x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_popcountv8qi (__a);
-+ float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-+ return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vcnt_u8 (uint8x8_t __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmad_laneq_f64 (float64_t __a, float64_t __b,
-+ float64x2_t __c, const int __lane)
- {
-- return (uint8x8_t) __builtin_aarch64_popcountv8qi ((int8x8_t) __a);
-+ return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vcntq_p8 (poly8x16_t __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmas_laneq_f32 (float32_t __a, float32_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- return (poly8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
-+ return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vcntq_s8 (int8x16_t __a)
-+/* vfmaq_lane */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_popcountv16qi (__a);
-+ return __builtin_aarch64_fmav4sf (__b,
-+ __aarch64_vdupq_lane_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vcntq_u8 (uint8x16_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
-+ float64x1_t __c, const int __lane)
- {
-- return (uint8x16_t) __builtin_aarch64_popcountv16qi ((int8x16_t) __a);
-+ return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
- }
-
--/* vcvt (double -> float). */
-+/* vfmaq_laneq */
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vcvt_f16_f32 (float32x4_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- return __builtin_aarch64_float_truncate_lo_v4hf (__a);
-+ return __builtin_aarch64_fmav4sf (__b,
-+ __aarch64_vdupq_laneq_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vcvt_high_f16_f32 (float16x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-+ float64x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_float_truncate_hi_v8hf (__a, __b);
-+ return __builtin_aarch64_fmav2df (__b,
-+ __aarch64_vdupq_laneq_f64 (__c, __lane),
-+ __a);
- }
-
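The lane-indexed forms simply broadcast the selected lane of __c and reuse the same fma builtins, so no separate lane-indexed builtin is needed. Illustrative sketch under the same assumptions as the ones above:

#include <arm_neon.h>

float32x4_t
scale_acc (float32x4_t acc, float32x4_t x, float32x4_t k)
{
  /* acc + x * k[1]; the lane index must be a constant in [0, 3].  */
  return vfmaq_laneq_f32 (acc, x, k, 1);
}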
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvt_f32_f64 (float64x2_t __a)
-+/* vfms */
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
- {
-- return __builtin_aarch64_float_truncate_lo_v2sf (__a);
-+ return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
-- return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
-+ return __builtin_aarch64_fmav2sf (-__b, __c, __a);
- }
-
--/* vcvt (float -> double). */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvt_f32_f16 (float16x4_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
-- return __builtin_aarch64_float_extend_lo_v4sf (__a);
-+ return __builtin_aarch64_fmav4sf (-__b, __c, __a);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvt_f64_f32 (float32x2_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
- {
--
-- return __builtin_aarch64_float_extend_lo_v2df (__a);
-+ return __builtin_aarch64_fmav2df (-__b, __c, __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvt_high_f32_f16 (float16x8_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
- {
-- return __builtin_aarch64_vec_unpacks_hi_v8hf (__a);
-+ return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvt_high_f64_f32 (float32x4_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
- {
-- return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
-+ return (float64x1_t) {-__b[0] * __c + __a[0]};
- }
-
--/* vcvt (<u>int -> float) */
--
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vcvtd_f64_s64 (int64_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
- {
-- return (float64_t) __a;
-+ return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vcvtd_f64_u64 (uint64_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
- {
-- return (float64_t) __a;
-+ return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
- }
-
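vfms negates the multiplicand rather than the product: negating an input is exact, so fma (-__b, __c, __a) still evaluates __a - __b * __c with a single rounding, unlike a separate multiply followed by a subtract. Illustrative sketch (assumes an AArch64 target):

#include <arm_neon.h>

float64x2_t
residual (float64x2_t acc, float64x2_t x, float64x2_t y)
{
  /* acc - x * y with one rounding: fma (-x, y, acc).  */
  return vfmsq_f64 (acc, x, y);
}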
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vcvts_f32_s32 (int32_t __a)
-+/* vfms_lane */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- return (float32_t) __a;
-+ return __builtin_aarch64_fmav2sf (-__b,
-+ __aarch64_vdup_lane_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vcvts_f32_u32 (uint32_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
-+ float64x1_t __c, const int __lane)
- {
-- return (float32_t) __a;
-+ return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvt_f32_s32 (int32x2_t __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsd_lane_f64 (float64_t __a, float64_t __b,
-+ float64x1_t __c, const int __lane)
- {
-- return __builtin_aarch64_floatv2siv2sf (__a);
-+ return __builtin_fma (-__b, __c[0], __a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vcvt_f32_u32 (uint32x2_t __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmss_lane_f32 (float32_t __a, float32_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
-+ return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvtq_f32_s32 (int32x4_t __a)
-+/* vfms_laneq */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- return __builtin_aarch64_floatv4siv4sf (__a);
-+ return __builtin_aarch64_fmav2sf (-__b,
-+ __aarch64_vdup_laneq_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vcvtq_f32_u32 (uint32x4_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
-+ float64x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
-+ float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-+ return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvtq_f64_s64 (int64x2_t __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsd_laneq_f64 (float64_t __a, float64_t __b,
-+ float64x2_t __c, const int __lane)
- {
-- return __builtin_aarch64_floatv2div2df (__a);
-+ return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vcvtq_f64_u64 (uint64x2_t __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmss_laneq_f32 (float32_t __a, float32_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
-+ return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
- }
-
--/* vcvt (float -> <u>int) */
-+/* vfmsq_lane */
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtd_s64_f64 (float64_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- return (int64_t) __a;
-+ return __builtin_aarch64_fmav4sf (-__b,
-+ __aarch64_vdupq_lane_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtd_u64_f64 (float64_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
-+ float64x1_t __c, const int __lane)
- {
-- return (uint64_t) __a;
-+ return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvts_s32_f32 (float32_t __a)
-+/* vfmsq_laneq */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- return (int32_t) __a;
-+ return __builtin_aarch64_fmav4sf (-__b,
-+ __aarch64_vdupq_laneq_f32 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvts_u32_f32 (float32_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-+ float64x2_t __c, const int __lane)
- {
-- return (uint32_t) __a;
-+ return __builtin_aarch64_fmav2df (-__b,
-+ __aarch64_vdupq_laneq_f64 (__c, __lane),
-+ __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvt_s32_f32 (float32x2_t __a)
-+/* vld1 */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_f16 (const float16_t *__a)
- {
-- return __builtin_aarch64_lbtruncv2sfv2si (__a);
-+ return __builtin_aarch64_ld1v4hf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvt_u32_f32 (float32x2_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_f32 (const float32_t *a)
- {
-- return __builtin_aarch64_lbtruncuv2sfv2si_us (__a);
-+ return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtq_s32_f32 (float32x4_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_f64 (const float64_t *a)
- {
-- return __builtin_aarch64_lbtruncv4sfv4si (__a);
-+ return (float64x1_t) {*a};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtq_u32_f32 (float32x4_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_p8 (const poly8_t *a)
- {
-- return __builtin_aarch64_lbtruncuv4sfv4si_us (__a);
-+ return (poly8x8_t)
-+ __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvt_s64_f64 (float64x1_t __a)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_p16 (const poly16_t *a)
- {
-- return (int64x1_t) {vcvtd_s64_f64 (__a[0])};
-+ return (poly16x4_t)
-+ __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvt_u64_f64 (float64x1_t __a)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_p64 (const poly64_t *a)
- {
-- return (uint64x1_t) {vcvtd_u64_f64 (__a[0])};
-+ return (poly64x1_t) {*a};
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtq_s64_f64 (float64x2_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_s8 (const int8_t *a)
- {
-- return __builtin_aarch64_lbtruncv2dfv2di (__a);
-+ return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtq_u64_f64 (float64x2_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_s16 (const int16_t *a)
- {
-- return __builtin_aarch64_lbtruncuv2dfv2di_us (__a);
-+ return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
- }
-
--/* vcvta */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtad_s64_f64 (float64_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_s32 (const int32_t *a)
- {
-- return __builtin_aarch64_lrounddfdi (__a);
-+ return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtad_u64_f64 (float64_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_s64 (const int64_t *a)
- {
-- return __builtin_aarch64_lroundudfdi_us (__a);
-+ return (int64x1_t) {*a};
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtas_s32_f32 (float32_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_u8 (const uint8_t *a)
- {
-- return __builtin_aarch64_lroundsfsi (__a);
-+ return (uint8x8_t)
-+ __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtas_u32_f32 (float32_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_u16 (const uint16_t *a)
- {
-- return __builtin_aarch64_lroundusfsi_us (__a);
-+ return (uint16x4_t)
-+ __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvta_s32_f32 (float32x2_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_u32 (const uint32_t *a)
- {
-- return __builtin_aarch64_lroundv2sfv2si (__a);
-+ return (uint32x2_t)
-+ __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvta_u32_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_u64 (const uint64_t *a)
- {
-- return __builtin_aarch64_lrounduv2sfv2si_us (__a);
-+ return (uint64x1_t) {*a};
- }
-
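The vld1 forms are plain 64-bit loads through the __builtin_aarch64_simd_* pointer types, while the one-element s64/u64/f64/p64 variants just dereference and build a single-lane vector. A usage sketch, not part of the patch (assumes an AArch64 target):

#include <arm_neon.h>

float32x2_t
pairwise_sum (const float32_t *p)
{
  /* Two 64-bit loads, then a lane-wise add.  */
  return vadd_f32 (vld1_f32 (p), vld1_f32 (p + 2));
}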
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtaq_s32_f32 (float32x4_t __a)
-+/* vld1q */
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_f16 (const float16_t *__a)
- {
-- return __builtin_aarch64_lroundv4sfv4si (__a);
-+ return __builtin_aarch64_ld1v8hf (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtaq_u32_f32 (float32x4_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_f32 (const float32_t *a)
- {
-- return __builtin_aarch64_lrounduv4sfv4si_us (__a);
-+ return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvta_s64_f64 (float64x1_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_f64 (const float64_t *a)
- {
-- return (int64x1_t) {vcvtad_s64_f64 (__a[0])};
-+ return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvta_u64_f64 (float64x1_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_p8 (const poly8_t *a)
- {
-- return (uint64x1_t) {vcvtad_u64_f64 (__a[0])};
-+ return (poly8x16_t)
-+ __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtaq_s64_f64 (float64x2_t __a)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_p16 (const poly16_t *a)
- {
-- return __builtin_aarch64_lroundv2dfv2di (__a);
-+ return (poly16x8_t)
-+ __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtaq_u64_f64 (float64x2_t __a)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_p64 (const poly64_t *a)
- {
-- return __builtin_aarch64_lrounduv2dfv2di_us (__a);
-+ return (poly64x2_t)
-+ __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
- }
-
--/* vcvtm */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtmd_s64_f64 (float64_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_s8 (const int8_t *a)
- {
-- return __builtin_llfloor (__a);
-+ return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtmd_u64_f64 (float64_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_s16 (const int16_t *a)
- {
-- return __builtin_aarch64_lfloorudfdi_us (__a);
-+ return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtms_s32_f32 (float32_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_s32 (const int32_t *a)
- {
-- return __builtin_ifloorf (__a);
-+ return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtms_u32_f32 (float32_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_s64 (const int64_t *a)
- {
-- return __builtin_aarch64_lfloorusfsi_us (__a);
-+ return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvtm_s32_f32 (float32x2_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_u8 (const uint8_t *a)
- {
-- return __builtin_aarch64_lfloorv2sfv2si (__a);
-+ return (uint8x16_t)
-+ __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvtm_u32_f32 (float32x2_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_u16 (const uint16_t *a)
- {
-- return __builtin_aarch64_lflooruv2sfv2si_us (__a);
-+ return (uint16x8_t)
-+ __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtmq_s32_f32 (float32x4_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_u32 (const uint32_t *a)
- {
-- return __builtin_aarch64_lfloorv4sfv4si (__a);
-+ return (uint32x4_t)
-+ __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtmq_u32_f32 (float32x4_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_u64 (const uint64_t *a)
- {
-- return __builtin_aarch64_lflooruv4sfv4si_us (__a);
-+ return (uint64x2_t)
-+ __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
- }
-
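The same pattern holds at 128 bits: one load builtin per machine mode, with the unsigned and poly variants sharing the signed builtin through casts. Illustrative sketch:

#include <arm_neon.h>

uint32_t
checksum4 (const uint32_t *p)
{
  /* 128-bit load followed by an across-lanes add.  */
  return vaddvq_u32 (vld1q_u32 (p));
}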
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvtm_s64_f64 (float64x1_t __a)
-+/* vld1_dup */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_f16 (const float16_t* __a)
- {
-- return (int64x1_t) {vcvtmd_s64_f64 (__a[0])};
-+ return vdup_n_f16 (*__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvtm_u64_f64 (float64x1_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_f32 (const float32_t* __a)
- {
-- return (uint64x1_t) {vcvtmd_u64_f64 (__a[0])};
-+ return vdup_n_f32 (*__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtmq_s64_f64 (float64x2_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_f64 (const float64_t* __a)
- {
-- return __builtin_aarch64_lfloorv2dfv2di (__a);
-+ return vdup_n_f64 (*__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtmq_u64_f64 (float64x2_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_p8 (const poly8_t* __a)
- {
-- return __builtin_aarch64_lflooruv2dfv2di_us (__a);
-+ return vdup_n_p8 (*__a);
- }
-
--/* vcvtn */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtnd_s64_f64 (float64_t __a)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_p16 (const poly16_t* __a)
- {
-- return __builtin_aarch64_lfrintndfdi (__a);
-+ return vdup_n_p16 (*__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtnd_u64_f64 (float64_t __a)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_p64 (const poly64_t* __a)
- {
-- return __builtin_aarch64_lfrintnudfdi_us (__a);
-+ return vdup_n_p64 (*__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtns_s32_f32 (float32_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_s8 (const int8_t* __a)
- {
-- return __builtin_aarch64_lfrintnsfsi (__a);
-+ return vdup_n_s8 (*__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtns_u32_f32 (float32_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_s16 (const int16_t* __a)
- {
-- return __builtin_aarch64_lfrintnusfsi_us (__a);
-+ return vdup_n_s16 (*__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvtn_s32_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_s32 (const int32_t* __a)
- {
-- return __builtin_aarch64_lfrintnv2sfv2si (__a);
-+ return vdup_n_s32 (*__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvtn_u32_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_s64 (const int64_t* __a)
- {
-- return __builtin_aarch64_lfrintnuv2sfv2si_us (__a);
-+ return vdup_n_s64 (*__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtnq_s32_f32 (float32x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_u8 (const uint8_t* __a)
- {
-- return __builtin_aarch64_lfrintnv4sfv4si (__a);
-+ return vdup_n_u8 (*__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtnq_u32_f32 (float32x4_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_u16 (const uint16_t* __a)
- {
-- return __builtin_aarch64_lfrintnuv4sfv4si_us (__a);
-+ return vdup_n_u16 (*__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvtn_s64_f64 (float64x1_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_u32 (const uint32_t* __a)
- {
-- return (int64x1_t) {vcvtnd_s64_f64 (__a[0])};
-+ return vdup_n_u32 (*__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvtn_u64_f64 (float64x1_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_dup_u64 (const uint64_t* __a)
- {
-- return (uint64x1_t) {vcvtnd_u64_f64 (__a[0])};
-+ return vdup_n_u64 (*__a);
- }
-
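vld1_dup is defined as a scalar load followed by the matching vdup_n broadcast, which typically folds into a single LD1R instruction. Illustrative sketch (assumes an AArch64 target):

#include <arm_neon.h>

int16x4_t
scale_by_coeff (int16x4_t v, const int16_t *k)
{
  /* Broadcast *k to all four lanes, then multiply lane-wise.  */
  return vmul_s16 (v, vld1_dup_s16 (k));
}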
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtnq_s64_f64 (float64x2_t __a)
-+/* vld1q_dup */
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_f16 (const float16_t* __a)
- {
-- return __builtin_aarch64_lfrintnv2dfv2di (__a);
-+ return vdupq_n_f16 (*__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtnq_u64_f64 (float64x2_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_f32 (const float32_t* __a)
- {
-- return __builtin_aarch64_lfrintnuv2dfv2di_us (__a);
-+ return vdupq_n_f32 (*__a);
- }
-
--/* vcvtp */
--
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vcvtpd_s64_f64 (float64_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_f64 (const float64_t* __a)
- {
-- return __builtin_llceil (__a);
-+ return vdupq_n_f64 (*__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vcvtpd_u64_f64 (float64_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_p8 (const poly8_t* __a)
- {
-- return __builtin_aarch64_lceiludfdi_us (__a);
-+ return vdupq_n_p8 (*__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vcvtps_s32_f32 (float32_t __a)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_p16 (const poly16_t* __a)
- {
-- return __builtin_iceilf (__a);
-+ return vdupq_n_p16 (*__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vcvtps_u32_f32 (float32_t __a)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_p64 (const poly64_t* __a)
- {
-- return __builtin_aarch64_lceilusfsi_us (__a);
-+ return vdupq_n_p64 (*__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vcvtp_s32_f32 (float32x2_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_s8 (const int8_t* __a)
- {
-- return __builtin_aarch64_lceilv2sfv2si (__a);
-+ return vdupq_n_s8 (*__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vcvtp_u32_f32 (float32x2_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_s16 (const int16_t* __a)
- {
-- return __builtin_aarch64_lceiluv2sfv2si_us (__a);
-+ return vdupq_n_s16 (*__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vcvtpq_s32_f32 (float32x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_s32 (const int32_t* __a)
- {
-- return __builtin_aarch64_lceilv4sfv4si (__a);
-+ return vdupq_n_s32 (*__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vcvtpq_u32_f32 (float32x4_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_s64 (const int64_t* __a)
- {
-- return __builtin_aarch64_lceiluv4sfv4si_us (__a);
-+ return vdupq_n_s64 (*__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vcvtp_s64_f64 (float64x1_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_u8 (const uint8_t* __a)
- {
-- return (int64x1_t) {vcvtpd_s64_f64 (__a[0])};
-+ return vdupq_n_u8 (*__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vcvtp_u64_f64 (float64x1_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_u16 (const uint16_t* __a)
- {
-- return (uint64x1_t) {vcvtpd_u64_f64 (__a[0])};
-+ return vdupq_n_u16 (*__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vcvtpq_s64_f64 (float64x2_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_u32 (const uint32_t* __a)
- {
-- return __builtin_aarch64_lceilv2dfv2di (__a);
-+ return vdupq_n_u32 (*__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vcvtpq_u64_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_dup_u64 (const uint64_t* __a)
- {
-- return __builtin_aarch64_lceiluv2dfv2di_us (__a);
-+ return vdupq_n_u64 (*__a);
- }
-
--/* vdup_n */
-+/* vld1_lane */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vdup_n_f32 (float32_t __a)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
- {
-- return (float32x2_t) {__a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vdup_n_f64 (float64_t __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
- {
-- return (float64x1_t) {__a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vdup_n_p8 (poly8_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
- {
-- return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vdup_n_p16 (poly16_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
- {
-- return (poly16x4_t) {__a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vdup_n_s8 (int8_t __a)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
- {
-- return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vdup_n_s16 (int16_t __a)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, const int __lane)
- {
-- return (int16x4_t) {__a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vdup_n_s32 (int32_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
- {
-- return (int32x2_t) {__a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vdup_n_s64 (int64_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
- {
-- return (int64x1_t) {__a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vdup_n_u8 (uint8_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
- {
-- return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vdup_n_u16 (uint16_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
- {
-- return (uint16x4_t) {__a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vdup_n_u32 (uint32_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
- {
-- return (uint32x2_t) {__a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vdup_n_u64 (uint64_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
- {
-- return (uint64x1_t) {__a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--/* vdupq_n */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vdupq_n_f32 (float32_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
- {
-- return (float32x4_t) {__a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vdupq_n_f64 (float64_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
- {
-- return (float64x2_t) {__a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vdupq_n_p8 (uint32_t __a)
-+/* vld1q_lane */
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
- {
-- return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-- __a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vdupq_n_p16 (uint32_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
- {
-- return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vdupq_n_s8 (int32_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
- {
-- return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-- __a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vdupq_n_s16 (int32_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
- {
-- return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vdupq_n_s32 (int32_t __a)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
- {
-- return (int32x4_t) {__a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vdupq_n_s64 (int64_t __a)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_p64 (const poly64_t *__src, poly64x2_t __vec, const int __lane)
- {
-- return (int64x2_t) {__a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vdupq_n_u8 (uint32_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
- {
-- return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
-- __a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vdupq_n_u16 (uint32_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
- {
-- return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vdupq_n_u32 (uint32_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
- {
-- return (uint32x4_t) {__a, __a, __a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vdupq_n_u64 (uint64_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
- {
-- return (uint64x2_t) {__a, __a};
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--/* vdup_lane */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vdup_lane_f32 (float32x2_t __a, const int __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
- {
-- return __aarch64_vdup_lane_f32 (__a, __b);
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vdup_lane_f64 (float64x1_t __a, const int __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
- {
-- return __aarch64_vdup_lane_f64 (__a, __b);
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vdup_lane_p8 (poly8x8_t __a, const int __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
- {
-- return __aarch64_vdup_lane_p8 (__a, __b);
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vdup_lane_p16 (poly16x4_t __a, const int __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
- {
-- return __aarch64_vdup_lane_p16 (__a, __b);
-+ return __aarch64_vset_lane_any (*__src, __vec, __lane);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vdup_lane_s8 (int8x8_t __a, const int __b)
-+/* vldn */
-+
-+__extension__ extern __inline int64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_s64 (const int64_t * __a)
- {
-- return __aarch64_vdup_lane_s8 (__a, __b);
-+ int64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-+ ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vdup_lane_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline uint64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_u64 (const uint64_t * __a)
- {
-- return __aarch64_vdup_lane_s16 (__a, __b);
-+ uint64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-+ ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vdup_lane_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline float64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_f64 (const float64_t * __a)
- {
-- return __aarch64_vdup_lane_s32 (__a, __b);
-+ float64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
-+ ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
-+ return ret;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vdup_lane_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_s8 (const int8_t * __a)
- {
-- return __aarch64_vdup_lane_s64 (__a, __b);
-+ int8x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-+ ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vdup_lane_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_p8 (const poly8_t * __a)
- {
-- return __aarch64_vdup_lane_u8 (__a, __b);
-+ poly8x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-+ ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vdup_lane_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline poly64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_p64 (const poly64_t * __a)
- {
-- return __aarch64_vdup_lane_u16 (__a, __b);
-+ poly64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
-+ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vdup_lane_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_s16 (const int16_t * __a)
- {
-- return __aarch64_vdup_lane_u32 (__a, __b);
-+ int16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-+ ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vdup_lane_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_p16 (const poly16_t * __a)
- {
-- return __aarch64_vdup_lane_u64 (__a, __b);
-+ poly16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-+ ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-+ return ret;
- }
-
--/* vdup_laneq */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vdup_laneq_f32 (float32x4_t __a, const int __b)
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_s32 (const int32_t * __a)
- {
-- return __aarch64_vdup_laneq_f32 (__a, __b);
-+ int32x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-+ ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vdup_laneq_f64 (float64x2_t __a, const int __b)
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_u8 (const uint8_t * __a)
- {
-- return __aarch64_vdup_laneq_f64 (__a, __b);
-+ uint8x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-+ ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vdup_laneq_p8 (poly8x16_t __a, const int __b)
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_u16 (const uint16_t * __a)
- {
-- return __aarch64_vdup_laneq_p8 (__a, __b);
-+ uint16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-+ ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vdup_laneq_p16 (poly16x8_t __a, const int __b)
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_u32 (const uint32_t * __a)
- {
-- return __aarch64_vdup_laneq_p16 (__a, __b);
-+ uint32x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-+ ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vdup_laneq_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_f16 (const float16_t * __a)
- {
-- return __aarch64_vdup_laneq_s8 (__a, __b);
-+ float16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4hf (__a);
-+ ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vdup_laneq_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_f32 (const float32_t * __a)
- {
-- return __aarch64_vdup_laneq_s16 (__a, __b);
-+ float32x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
-+ ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vdup_laneq_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_s8 (const int8_t * __a)
- {
-- return __aarch64_vdup_laneq_s32 (__a, __b);
-+ int8x16x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-+ ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vdup_laneq_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_p8 (const poly8_t * __a)
- {
-- return __aarch64_vdup_laneq_s64 (__a, __b);
-+ poly8x16x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-+ ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vdup_laneq_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_s16 (const int16_t * __a)
- {
-- return __aarch64_vdup_laneq_u8 (__a, __b);
-+ int16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vdup_laneq_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_p16 (const poly16_t * __a)
- {
-- return __aarch64_vdup_laneq_u16 (__a, __b);
-+ poly16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-+ ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vdup_laneq_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline poly64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_p64 (const poly64_t * __a)
- {
-- return __aarch64_vdup_laneq_u32 (__a, __b);
-+ poly64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
-+ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vdup_laneq_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_s32 (const int32_t * __a)
- {
-- return __aarch64_vdup_laneq_u64 (__a, __b);
-+ int32x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-+ ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-+ return ret;
- }
-
--/* vdupq_lane */
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vdupq_lane_f32 (float32x2_t __a, const int __b)
-+__extension__ extern __inline int64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_s64 (const int64_t * __a)
- {
-- return __aarch64_vdupq_lane_f32 (__a, __b);
-+ int64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vdupq_lane_f64 (float64x1_t __a, const int __b)
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_u8 (const uint8_t * __a)
- {
-- return __aarch64_vdupq_lane_f64 (__a, __b);
-+ uint8x16x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-+ ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vdupq_lane_p8 (poly8x8_t __a, const int __b)
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_u16 (const uint16_t * __a)
- {
-- return __aarch64_vdupq_lane_p8 (__a, __b);
-+ uint16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-+ ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vdupq_lane_p16 (poly16x4_t __a, const int __b)
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_u32 (const uint32_t * __a)
- {
-- return __aarch64_vdupq_lane_p16 (__a, __b);
-+ uint32x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-+ ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vdupq_lane_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline uint64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_u64 (const uint64_t * __a)
- {
-- return __aarch64_vdupq_lane_s8 (__a, __b);
-+ uint64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-+ ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vdupq_lane_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_f16 (const float16_t * __a)
- {
-- return __aarch64_vdupq_lane_s16 (__a, __b);
-+ float16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v8hf (__a);
-+ ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vdupq_lane_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_f32 (const float32_t * __a)
- {
-- return __aarch64_vdupq_lane_s32 (__a, __b);
-+ float32x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-+ ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vdupq_lane_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline float64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_f64 (const float64_t * __a)
- {
-- return __aarch64_vdupq_lane_s64 (__a, __b);
-+ float64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-+ ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vdupq_lane_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline int64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_s64 (const int64_t * __a)
- {
-- return __aarch64_vdupq_lane_u8 (__a, __b);
-+ int64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-+ ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-+ ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vdupq_lane_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline uint64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_u64 (const uint64_t * __a)
- {
-- return __aarch64_vdupq_lane_u16 (__a, __b);
-+ uint64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-+ ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-+ ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vdupq_lane_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline float64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_f64 (const float64_t * __a)
- {
-- return __aarch64_vdupq_lane_u32 (__a, __b);
-+ float64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
-+ ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
-+ ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
-+ return ret;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vdupq_lane_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline int8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_s8 (const int8_t * __a)
- {
-- return __aarch64_vdupq_lane_u64 (__a, __b);
-+ int8x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-+ ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-+ ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-+ return ret;
- }
-
--/* vdupq_laneq */
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vdupq_laneq_f32 (float32x4_t __a, const int __b)
-+__extension__ extern __inline poly8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_p8 (const poly8_t * __a)
- {
-- return __aarch64_vdupq_laneq_f32 (__a, __b);
-+ poly8x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-+ ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-+ ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vdupq_laneq_f64 (float64x2_t __a, const int __b)
-+__extension__ extern __inline int16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_s16 (const int16_t * __a)
- {
-- return __aarch64_vdupq_laneq_f64 (__a, __b);
-+ int16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-+ ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-+ ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vdupq_laneq_p8 (poly8x16_t __a, const int __b)
-+__extension__ extern __inline poly16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_p16 (const poly16_t * __a)
- {
-- return __aarch64_vdupq_laneq_p8 (__a, __b);
-+ poly16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-+ ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-+ ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vdupq_laneq_p16 (poly16x8_t __a, const int __b)
-+__extension__ extern __inline int32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_s32 (const int32_t * __a)
- {
-- return __aarch64_vdupq_laneq_p16 (__a, __b);
-+ int32x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-+ ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-+ ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vdupq_laneq_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline uint8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_u8 (const uint8_t * __a)
- {
-- return __aarch64_vdupq_laneq_s8 (__a, __b);
-+ uint8x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-+ ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-+ ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vdupq_laneq_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline uint16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_u16 (const uint16_t * __a)
- {
-- return __aarch64_vdupq_laneq_s16 (__a, __b);
-+ uint16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-+ ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-+ ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vdupq_laneq_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline uint32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_u32 (const uint32_t * __a)
- {
-- return __aarch64_vdupq_laneq_s32 (__a, __b);
-+ uint32x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-+ ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-+ ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vdupq_laneq_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline float16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_f16 (const float16_t * __a)
- {
-- return __aarch64_vdupq_laneq_s64 (__a, __b);
-+ float16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4hf (__a);
-+ ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
-+ ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vdupq_laneq_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline float32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_f32 (const float32_t * __a)
- {
-- return __aarch64_vdupq_laneq_u8 (__a, __b);
-+ float32x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
-+ ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
-+ ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vdupq_laneq_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline poly64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_p64 (const poly64_t * __a)
- {
-- return __aarch64_vdupq_laneq_u16 (__a, __b);
-+ poly64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
-+ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
-+ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vdupq_laneq_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline int8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_s8 (const int8_t * __a)
- {
-- return __aarch64_vdupq_laneq_u32 (__a, __b);
-+ int8x16x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-+ ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-+ ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vdupq_laneq_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline poly8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_p8 (const poly8_t * __a)
- {
-- return __aarch64_vdupq_laneq_u64 (__a, __b);
-+ poly8x16x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-+ ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-+ ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-+ return ret;
- }
-
--/* vdupb_lane */
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
--vdupb_lane_p8 (poly8x8_t __a, const int __b)
-+__extension__ extern __inline int16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_s16 (const int16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-+ ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vdupb_lane_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline poly16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_p16 (const poly16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ poly16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-+ ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-+ ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vdupb_lane_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline int32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_s32 (const int32_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int32x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-+ ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-+ ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-+ return ret;
- }
-
--/* vduph_lane */
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
--vduph_lane_p16 (poly16x4_t __a, const int __b)
-+__extension__ extern __inline int64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_s64 (const int64_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-+ ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vduph_lane_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline uint8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_u8 (const uint8_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint8x16x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-+ ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-+ ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vduph_lane_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline uint16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_u16 (const uint16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-+ ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-+ ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-+ return ret;
- }
-
--/* vdups_lane */
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vdups_lane_f32 (float32x2_t __a, const int __b)
-+__extension__ extern __inline uint32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_u32 (const uint32_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint32x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-+ ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-+ ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vdups_lane_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline uint64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_u64 (const uint64_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-+ ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-+ ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vdups_lane_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline float16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_f16 (const float16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ float16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v8hf (__a);
-+ ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
-+ ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
-+ return ret;
- }
-
--/* vdupd_lane */
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vdupd_lane_f64 (float64x1_t __a, const int __b)
-+__extension__ extern __inline float32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_f32 (const float32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __b);
-- return __a[0];
-+ float32x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-+ ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-+ ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vdupd_lane_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline float64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_f64 (const float64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __b);
-- return __a[0];
-+ float64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-+ ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-+ ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vdupd_lane_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline poly64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_p64 (const poly64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __b);
-- return __a[0];
-+ poly64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
-+ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
-+ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
-+ return ret;
- }
-
--/* vdupb_laneq */
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
--vdupb_laneq_p8 (poly8x16_t __a, const int __b)
-+__extension__ extern __inline int64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_s64 (const int64_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-+ ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-+ ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-+ ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b)
-+__extension__ extern __inline uint64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_u64 (const uint64_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-+ ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-+ ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-+ ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vdupb_laneq_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline float64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_f64 (const float64_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ float64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)};
-+ ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)};
-+ ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
-+ ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
-+ return ret;
- }
-
--/* vduph_laneq */
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
--vduph_laneq_p16 (poly16x8_t __a, const int __b)
-+__extension__ extern __inline int8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_s8 (const int8_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int8x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-+ ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-+ ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-+ ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vduph_laneq_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline poly8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_p8 (const poly8_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ poly8x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-+ ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-+ ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-+ ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vduph_laneq_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline int16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_s16 (const int16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-+ ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-+ ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-+ ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-+ return ret;
- }
-
--/* vdups_laneq */
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vdups_laneq_f32 (float32x4_t __a, const int __b)
-+__extension__ extern __inline poly16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_p16 (const poly16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ poly16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-+ ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-+ ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-+ ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vdups_laneq_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline int32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_s32 (const int32_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ int32x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-+ ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-+ ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-+ ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vdups_laneq_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline uint8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_u8 (const uint8_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint8x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-+ ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-+ ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-+ ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-+ return ret;
- }
-
--/* vdupd_laneq */
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vdupd_laneq_f64 (float64x2_t __a, const int __b)
-+__extension__ extern __inline uint16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_u16 (const uint16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-+ ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-+ ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-+ ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vdupd_laneq_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline uint32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_u32 (const uint32_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ uint32x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-+ ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-+ ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-+ ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vdupd_laneq_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline float16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_f16 (const float16_t * __a)
- {
-- return __aarch64_vget_lane_any (__a, __b);
-+ float16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4hf (__a);
-+ ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1);
-+ ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
-+ ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
-+ return ret;
- }
-
--/* vext */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
-+__extension__ extern __inline float32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_f32 (const float32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
--#endif
-+ float32x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0);
-+ ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1);
-+ ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
-+ ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vext_f64 (float64x1_t __a, float64x1_t __b, __const int __c)
-+__extension__ extern __inline poly64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_p64 (const poly64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
-- /* The only possible index to the assembler instruction returns element 0. */
-- return __a;
-+ poly64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
-+ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
-+ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
-+ ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
-+ return ret;
- }
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vext_p8 (poly8x8_t __a, poly8x8_t __b, __const int __c)
-+
-+__extension__ extern __inline int8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_s8 (const int8_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
-+ int8x16x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-+ ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-+ ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-+ ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vext_p16 (poly16x4_t __a, poly16x4_t __b, __const int __c)
-+__extension__ extern __inline poly8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_p8 (const poly8_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
-+ poly8x16x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-+ ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-+ ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-+ ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vext_s8 (int8x8_t __a, int8x8_t __b, __const int __c)
-+__extension__ extern __inline int16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_s16 (const int16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
-+ int16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-+ ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-+ ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vext_s16 (int16x4_t __a, int16x4_t __b, __const int __c)
-+__extension__ extern __inline poly16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_p16 (const poly16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
-+ poly16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-+ ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-+ ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-+ ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vext_s32 (int32x2_t __a, int32x2_t __b, __const int __c)
-+__extension__ extern __inline int32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_s32 (const int32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
--#endif
-+ int32x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-+ ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-+ ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-+ ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vext_s64 (int64x1_t __a, int64x1_t __b, __const int __c)
-+__extension__ extern __inline int64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_s64 (const int64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
-- /* The only possible index to the assembler instruction returns element 0. */
-- return __a;
-+ int64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-+ ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-+ ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vext_u8 (uint8x8_t __a, uint8x8_t __b, __const int __c)
-+__extension__ extern __inline uint8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_u8 (const uint8_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
-+ uint8x16x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-+ ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-+ ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-+ ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vext_u16 (uint16x4_t __a, uint16x4_t __b, __const int __c)
-+__extension__ extern __inline uint16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_u16 (const uint16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint16x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
-+ uint16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-+ ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-+ ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-+ ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vext_u32 (uint32x2_t __a, uint32x2_t __b, __const int __c)
-+__extension__ extern __inline uint32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_u32 (const uint32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint32x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {__c, __c+1});
--#endif
-+ uint32x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-+ ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-+ ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-+ ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
-+__extension__ extern __inline uint64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_u64 (const uint64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
-- /* The only possible index to the assembler instruction returns element 0. */
-- return __a;
-+ uint64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-+ ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-+ ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-+ ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
-+__extension__ extern __inline float16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_f16 (const float16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
-+ float16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v8hf (__a);
-+ ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
-+ ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
-+ ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vextq_f64 (float64x2_t __a, float64x2_t __b, __const int __c)
-+__extension__ extern __inline float32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_f32 (const float32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
--#endif
-+ float32x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-+ ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-+ ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-+ ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vextq_p8 (poly8x16_t __a, poly8x16_t __b, __const int __c)
-+__extension__ extern __inline float64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_f64 (const float64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x16_t)
-- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
--#endif
-+ float64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-+ ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-+ ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-+ ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vextq_p16 (poly16x8_t __a, poly16x8_t __b, __const int __c)
-+__extension__ extern __inline poly64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_p64 (const poly64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint16x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
-+ poly64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
-+ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
-+ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
-+ ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vextq_s8 (int8x16_t __a, int8x16_t __b, __const int __c)
-+/* vldn_dup */
-+
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_s8 (const int8_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x16_t)
-- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
--#endif
-+ int8x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-+ ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vextq_s16 (int16x8_t __a, int16x8_t __b, __const int __c)
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_s16 (const int16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint16x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
-+ int16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-+ ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vextq_s32 (int32x4_t __a, int32x4_t __b, __const int __c)
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_s32 (const int32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
-+ int32x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-+ ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vextq_s64 (int64x2_t __a, int64x2_t __b, __const int __c)
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_f16 (const float16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
--#endif
-+ float16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-+ ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
-+ ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_f32 (const float32_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint8x16_t)
-- {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
-- 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
-- __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
--#endif
-+ float32x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
-+ ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vextq_u16 (uint16x8_t __a, uint16x8_t __b, __const int __c)
-+__extension__ extern __inline float64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_f64 (const float64_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint16x8_t)
-- {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint16x8_t) {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
--#endif
-+ float64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
-+ ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
-+ return ret;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vextq_u32 (uint32x4_t __a, uint32x4_t __b, __const int __c)
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_u8 (const uint8_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a,
-- (uint32x4_t) {4-__c, 5-__c, 6-__c, 7-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {__c, __c+1, __c+2, __c+3});
--#endif
-+ uint8x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-+ ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vextq_u64 (uint64x2_t __a, uint64x2_t __b, __const int __c)
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_u16 (const uint16_t * __a)
- {
-- __AARCH64_LANE_CHECK (__a, __c);
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
--#endif
-+ uint16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-+ ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-+ return ret;
- }
-
--/* vfma */
--
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_u32 (const uint32_t * __a)
- {
-- return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
-+ uint32x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-+ ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_p8 (const poly8_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (__b, __c, __a);
-+ poly8x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-+ ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_p16 (const poly16_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (__b, __c, __a);
-+ poly16x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-+ ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
-+__extension__ extern __inline poly64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_p64 (const poly64_t * __a)
- {
-- return __builtin_aarch64_fmav2df (__b, __c, __a);
-+ poly64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
-+ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
-+
-+__extension__ extern __inline int64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_s64 (const int64_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
-+ int64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-+ ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
-+__extension__ extern __inline uint64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2_dup_u64 (const uint64_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
-+ uint64x1x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-+ ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_s8 (const int8_t * __a)
- {
-- return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
-+ int8x16x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-+ ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-+ return ret;
- }
-
--/* vfma_lane */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_p8 (const poly8_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (__b,
-- __aarch64_vdup_lane_f32 (__c, __lane),
-- __a);
-+ poly8x16x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-+ ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfma_lane_f64 (float64x1_t __a, float64x1_t __b,
-- float64x1_t __c, const int __lane)
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_s16 (const int16_t * __a)
- {
-- return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
-+ int16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmad_lane_f64 (float64_t __a, float64_t __b,
-- float64x1_t __c, const int __lane)
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_p16 (const poly16_t * __a)
- {
-- return __builtin_fma (__b, __c[0], __a);
-+ poly16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-+ ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmas_lane_f32 (float32_t __a, float32_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_s32 (const int32_t * __a)
- {
-- return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ int32x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-+ ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-+ return ret;
- }
-
--/* vfma_laneq */
-+__extension__ extern __inline int64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_s64 (const int64_t * __a)
-+{
-+ int64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-+ return ret;
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_u8 (const uint8_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (__b,
-- __aarch64_vdup_laneq_f32 (__c, __lane),
-- __a);
-+ uint8x16x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-+ ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfma_laneq_f64 (float64x1_t __a, float64x1_t __b,
-- float64x2_t __c, const int __lane)
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_u16 (const uint16_t * __a)
- {
-- float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-- return (float64x1_t) {__builtin_fma (__b[0], __c0, __a[0])};
-+ uint16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-+ ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmad_laneq_f64 (float64_t __a, float64_t __b,
-- float64x2_t __c, const int __lane)
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_u32 (const uint32_t * __a)
- {
-- return __builtin_fma (__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ uint32x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-+ ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmas_laneq_f32 (float32_t __a, float32_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline uint64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_u64 (const uint64_t * __a)
- {
-- return __builtin_fmaf (__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ uint64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-+ ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-+ return ret;
- }
-
--/* vfmaq_lane */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_f16 (const float16_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (__b,
-- __aarch64_vdupq_lane_f32 (__c, __lane),
-- __a);
-+ float16x8x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-+ ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
-+ ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
-- float64x1_t __c, const int __lane)
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_f32 (const float32_t * __a)
- {
-- return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c[0]), __a);
-+ float32x4x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-+ ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-+ return ret;
- }
-
--/* vfmaq_laneq */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline float64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_f64 (const float64_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (__b,
-- __aarch64_vdupq_laneq_f32 (__c, __lane),
-- __a);
-+ float64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-+ ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-- float64x2_t __c, const int __lane)
-+__extension__ extern __inline poly64x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld2q_dup_p64 (const poly64_t * __a)
- {
-- return __builtin_aarch64_fmav2df (__b,
-- __aarch64_vdupq_laneq_f64 (__c, __lane),
-- __a);
-+ poly64x2x2_t ret;
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
-+ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
-+ return ret;
- }
-
--/* vfms */
--
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline int64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_s64 (const int64_t * __a)
- {
-- return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
-+ int64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-+ ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-+ ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
-+__extension__ extern __inline uint64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_u64 (const uint64_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (-__b, __c, __a);
-+ uint64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-+ ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-+ ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
-+__extension__ extern __inline float64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_f64 (const float64_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (-__b, __c, __a);
-+ float64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
-+ ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
-+ ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
-+__extension__ extern __inline int8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_s8 (const int8_t * __a)
- {
-- return __builtin_aarch64_fmav2df (-__b, __c, __a);
-+ int8x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-+ ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-+ ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-+ return ret;
- }
-
--
--/* vfms_lane */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline poly8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_p8 (const poly8_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (-__b,
-- __aarch64_vdup_lane_f32 (__c, __lane),
-- __a);
-+ poly8x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-+ ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-+ ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfms_lane_f64 (float64x1_t __a, float64x1_t __b,
-- float64x1_t __c, const int __lane)
-+__extension__ extern __inline int16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_s16 (const int16_t * __a)
- {
-- return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
-+ int16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-+ ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-+ ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmsd_lane_f64 (float64_t __a, float64_t __b,
-- float64x1_t __c, const int __lane)
-+__extension__ extern __inline poly16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_p16 (const poly16_t * __a)
- {
-- return __builtin_fma (-__b, __c[0], __a);
-+ poly16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-+ ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-+ ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmss_lane_f32 (float32_t __a, float32_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_s32 (const int32_t * __a)
- {
-- return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ int32x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-+ ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-+ ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-+ return ret;
- }
-
--/* vfms_laneq */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline uint8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_u8 (const uint8_t * __a)
- {
-- return __builtin_aarch64_fmav2sf (-__b,
-- __aarch64_vdup_laneq_f32 (__c, __lane),
-- __a);
-+ uint8x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-+ ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-+ ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vfms_laneq_f64 (float64x1_t __a, float64x1_t __b,
-- float64x2_t __c, const int __lane)
-+__extension__ extern __inline uint16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_u16 (const uint16_t * __a)
- {
-- float64_t __c0 = __aarch64_vget_lane_any (__c, __lane);
-- return (float64x1_t) {__builtin_fma (-__b[0], __c0, __a[0])};
-+ uint16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-+ ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-+ ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vfmsd_laneq_f64 (float64_t __a, float64_t __b,
-- float64x2_t __c, const int __lane)
-+__extension__ extern __inline uint32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_u32 (const uint32_t * __a)
- {
-- return __builtin_fma (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ uint32x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-+ ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-+ ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vfmss_laneq_f32 (float32_t __a, float32_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline float16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_f16 (const float16_t * __a)
- {
-- return __builtin_fmaf (-__b, __aarch64_vget_lane_any (__c, __lane), __a);
-+ float16x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-+ ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
-+ ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
-+ ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
-+ return ret;
- }
-
--/* vfmsq_lane */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline float32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_f32 (const float32_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (-__b,
-- __aarch64_vdupq_lane_f32 (__c, __lane),
-- __a);
-+ float32x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
-+ ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
-+ ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
-- float64x1_t __c, const int __lane)
-+__extension__ extern __inline poly64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3_dup_p64 (const poly64_t * __a)
- {
-- return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c[0]), __a);
-+ poly64x1x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
-+ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
-+ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
-+ return ret;
- }
-
--/* vfmsq_laneq */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_s8 (const int8_t * __a)
- {
-- return __builtin_aarch64_fmav4sf (-__b,
-- __aarch64_vdupq_laneq_f32 (__c, __lane),
-- __a);
-+ int8x16x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-+ ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-+ ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
-- float64x2_t __c, const int __lane)
-+__extension__ extern __inline poly8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_p8 (const poly8_t * __a)
- {
-- return __builtin_aarch64_fmav2df (-__b,
-- __aarch64_vdupq_laneq_f64 (__c, __lane),
-- __a);
-+ poly8x16x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-+ ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-+ ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-+ return ret;
- }
-
--/* vld1 */
-+__extension__ extern __inline int16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_s16 (const int16_t * __a)
-+{
-+ int16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-+ ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-+ return ret;
-+}
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vld1_f16 (const float16_t *__a)
-+__extension__ extern __inline poly16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_p16 (const poly16_t * __a)
- {
-- return __builtin_aarch64_ld1v4hf (__a);
-+ poly16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-+ ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-+ ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vld1_f32 (const float32_t *a)
-+__extension__ extern __inline int32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_s32 (const int32_t * __a)
- {
-- return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a);
-+ int32x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-+ ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-+ ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vld1_f64 (const float64_t *a)
-+__extension__ extern __inline int64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_s64 (const int64_t * __a)
- {
-- return (float64x1_t) {*a};
-+ int64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-+ ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vld1_p8 (const poly8_t *a)
-+__extension__ extern __inline uint8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_u8 (const uint8_t * __a)
- {
-- return (poly8x8_t)
-- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
-+ uint8x16x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-+ ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-+ ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vld1_p16 (const poly16_t *a)
-+__extension__ extern __inline uint16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_u16 (const uint16_t * __a)
- {
-- return (poly16x4_t)
-- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
-+ uint16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-+ ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-+ ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vld1_s8 (const int8_t *a)
-+__extension__ extern __inline uint32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_u32 (const uint32_t * __a)
- {
-- return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
-+ uint32x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-+ ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-+ ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vld1_s16 (const int16_t *a)
-+__extension__ extern __inline uint64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_u64 (const uint64_t * __a)
- {
-- return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
-+ uint64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-+ ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-+ ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vld1_s32 (const int32_t *a)
-+__extension__ extern __inline float16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_f16 (const float16_t * __a)
- {
-- return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
-+ float16x8x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-+ ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
-+ ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
-+ ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vld1_s64 (const int64_t *a)
-+__extension__ extern __inline float32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_f32 (const float32_t * __a)
- {
-- return (int64x1_t) {*a};
-+ float32x4x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-+ ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-+ ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vld1_u8 (const uint8_t *a)
-+__extension__ extern __inline float64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_f64 (const float64_t * __a)
- {
-- return (uint8x8_t)
-- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a);
-+ float64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-+ ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-+ ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vld1_u16 (const uint16_t *a)
-+__extension__ extern __inline poly64x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld3q_dup_p64 (const poly64_t * __a)
- {
-- return (uint16x4_t)
-- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
-+ poly64x2x3_t ret;
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
-+ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
-+ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
-+ return ret;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vld1_u32 (const uint32_t *a)
-+__extension__ extern __inline int64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_s64 (const int64_t * __a)
- {
-- return (uint32x2_t)
-- __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a);
-+ int64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-+ ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-+ ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-+ ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vld1_u64 (const uint64_t *a)
-+__extension__ extern __inline uint64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_u64 (const uint64_t * __a)
- {
-- return (uint64x1_t) {*a};
-+ uint64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-+ ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-+ ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-+ ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-+ return ret;
- }
-
--/* vld1q */
-+__extension__ extern __inline float64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_f64 (const float64_t * __a)
-+{
-+ float64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)};
-+ ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)};
-+ ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
-+ ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
-+ return ret;
-+}
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vld1q_f16 (const float16_t *__a)
-+__extension__ extern __inline int8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_s8 (const int8_t * __a)
- {
-- return __builtin_aarch64_ld1v8hf (__a);
-+ int8x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-+ ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-+ ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-+ ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vld1q_f32 (const float32_t *a)
-+__extension__ extern __inline poly8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_p8 (const poly8_t * __a)
- {
-- return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a);
-+ poly8x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-+ ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-+ ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-+ ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vld1q_f64 (const float64_t *a)
-+__extension__ extern __inline int16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_s16 (const int16_t * __a)
- {
-- return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a);
-+ int16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-+ ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-+ ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-+ ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vld1q_p8 (const poly8_t *a)
-+__extension__ extern __inline poly16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_p16 (const poly16_t * __a)
- {
-- return (poly8x16_t)
-- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
-+ poly16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-+ ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-+ ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-+ ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vld1q_p16 (const poly16_t *a)
-+__extension__ extern __inline int32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_s32 (const int32_t * __a)
- {
-- return (poly16x8_t)
-- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
-+ int32x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-+ ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-+ ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-+ ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vld1q_s8 (const int8_t *a)
-+__extension__ extern __inline uint8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_u8 (const uint8_t * __a)
- {
-- return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
-+ uint8x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-+ ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-+ ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-+ ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vld1q_s16 (const int16_t *a)
-+__extension__ extern __inline uint16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_u16 (const uint16_t * __a)
- {
-- return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
-+ uint16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-+ ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-+ ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-+ ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vld1q_s32 (const int32_t *a)
-+__extension__ extern __inline uint32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_u32 (const uint32_t * __a)
- {
-- return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
-+ uint32x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-+ ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-+ ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-+ ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vld1q_s64 (const int64_t *a)
-+__extension__ extern __inline float16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_f16 (const float16_t * __a)
- {
-- return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
-+ float16x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-+ ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
-+ ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
-+ ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
-+ ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vld1q_u8 (const uint8_t *a)
-+__extension__ extern __inline float32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_f32 (const float32_t * __a)
- {
-- return (uint8x16_t)
-- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
-+ float32x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0);
-+ ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1);
-+ ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
-+ ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vld1q_u16 (const uint16_t *a)
-+__extension__ extern __inline poly64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4_dup_p64 (const poly64_t * __a)
- {
-- return (uint16x8_t)
-- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
-+ poly64x1x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
-+ ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
-+ ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
-+ ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vld1q_u32 (const uint32_t *a)
-+__extension__ extern __inline int8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_s8 (const int8_t * __a)
- {
-- return (uint32x4_t)
-- __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a);
-+ int8x16x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-+ ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-+ ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-+ ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vld1q_u64 (const uint64_t *a)
-+__extension__ extern __inline poly8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_p8 (const poly8_t * __a)
- {
-- return (uint64x2_t)
-- __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
-+ poly8x16x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-+ ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-+ ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-+ ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-+ return ret;
- }
-
--/* vld1_dup */
--
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vld1_dup_f16 (const float16_t* __a)
-+__extension__ extern __inline int16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_s16 (const int16_t * __a)
- {
-- float16_t __f = *__a;
-- return (float16x4_t) { __f, __f, __f, __f };
-+ int16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-+ ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-+ ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-+ ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vld1_dup_f32 (const float32_t* __a)
-+__extension__ extern __inline poly16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_p16 (const poly16_t * __a)
- {
-- return vdup_n_f32 (*__a);
-+ poly16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-+ ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-+ ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-+ ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vld1_dup_f64 (const float64_t* __a)
-+__extension__ extern __inline int32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_s32 (const int32_t * __a)
- {
-- return vdup_n_f64 (*__a);
-+ int32x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-+ ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-+ ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-+ ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vld1_dup_p8 (const poly8_t* __a)
-+__extension__ extern __inline int64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_s64 (const int64_t * __a)
- {
-- return vdup_n_p8 (*__a);
-+ int64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-+ ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-+ ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-+ ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vld1_dup_p16 (const poly16_t* __a)
-+__extension__ extern __inline uint8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_u8 (const uint8_t * __a)
- {
-- return vdup_n_p16 (*__a);
-+ uint8x16x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-+ ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-+ ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-+ ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-+ ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vld1_dup_s8 (const int8_t* __a)
-+__extension__ extern __inline uint16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_u16 (const uint16_t * __a)
- {
-- return vdup_n_s8 (*__a);
-+ uint16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-+ ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-+ ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-+ ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-+ ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vld1_dup_s16 (const int16_t* __a)
-+__extension__ extern __inline uint32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_u32 (const uint32_t * __a)
- {
-- return vdup_n_s16 (*__a);
-+ uint32x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
-+ ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-+ ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-+ ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-+ ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vld1_dup_s32 (const int32_t* __a)
-+__extension__ extern __inline uint64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_u64 (const uint64_t * __a)
- {
-- return vdup_n_s32 (*__a);
-+ uint64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-+ ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-+ ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-+ ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vld1_dup_s64 (const int64_t* __a)
-+__extension__ extern __inline float16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_f16 (const float16_t * __a)
- {
-- return vdup_n_s64 (*__a);
-+ float16x8x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-+ ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
-+ ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
-+ ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
-+ ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vld1_dup_u8 (const uint8_t* __a)
-+__extension__ extern __inline float32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_f32 (const float32_t * __a)
- {
-- return vdup_n_u8 (*__a);
-+ float32x4x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-+ ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-+ ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-+ ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-+ ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vld1_dup_u16 (const uint16_t* __a)
-+__extension__ extern __inline float64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_f64 (const float64_t * __a)
- {
-- return vdup_n_u16 (*__a);
-+ float64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a);
-+ ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-+ ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-+ ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-+ ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vld1_dup_u32 (const uint32_t* __a)
-+__extension__ extern __inline poly64x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vld4q_dup_p64 (const poly64_t * __a)
- {
-- return vdup_n_u32 (*__a);
-+ poly64x2x4_t ret;
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-+ ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
-+ ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
-+ ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
-+ ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
-+ return ret;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vld1_dup_u64 (const uint64_t* __a)
--{
-- return vdup_n_u64 (*__a);
-+/* vld2_lane */
-+
-+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-+ qmode, ptrmode, funcsuffix, signedtype) \
-+__extension__ extern __inline intype \
-+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_oi __o; \
-+ largetype __temp; \
-+ __temp.val[0] = \
-+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-+ __temp.val[1] = \
-+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-+ __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-+ (signedtype) __temp.val[0], \
-+ 0); \
-+ __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-+ (signedtype) __temp.val[1], \
-+ 1); \
-+ __o = __builtin_aarch64_ld2_lane##mode ( \
-+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-+ __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
-+ __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
-+ return __b; \
- }
-
--/* vld1q_dup */
-+__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
-+ v8hf, hf, f16, float16x8_t)
-+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
-+ sf, f32, float32x4_t)
-+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df,
-+ df, f64, float64x2_t)
-+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-+ int8x16_t)
-+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi,
-+ p16, int16x8_t)
-+__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di,
-+ v2di_ssps, di, p64, poly64x2_t)
-+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-+ int8x16_t)
-+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-+ int16x8_t)
-+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-+ int32x4_t)
-+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-+ int64x2_t)
-+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-+ int8x16_t)
-+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi,
-+ u16, int16x8_t)
-+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si,
-+ u32, int32x4_t)
-+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
-+ u64, int64x2_t)
-
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_f16 (const float16_t* __a)
--{
-- float16_t __f = *__a;
-- return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
-+#undef __LD2_LANE_FUNC
-+
-+/* vld2q_lane */
-+
-+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-+__extension__ extern __inline intype \
-+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_oi __o; \
-+ intype ret; \
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
-+ __o = __builtin_aarch64_ld2_lane##mode ( \
-+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-+ ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
-+ ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
-+ return ret; \
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vld1q_dup_f32 (const float32_t* __a)
--{
-- return vdupq_n_f32 (*__a);
-+__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
-+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
-+__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
-+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-+__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
-+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
-+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
-+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
-+__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
-+__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
-+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
-+
-+#undef __LD2_LANE_FUNC
-+
-+/* vld3_lane */
-+
-+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-+ qmode, ptrmode, funcsuffix, signedtype) \
-+__extension__ extern __inline intype \
-+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_ci __o; \
-+ largetype __temp; \
-+ __temp.val[0] = \
-+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-+ __temp.val[1] = \
-+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-+ __temp.val[2] = \
-+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
-+ __o = __builtin_aarch64_set_qregci##qmode (__o, \
-+ (signedtype) __temp.val[0], \
-+ 0); \
-+ __o = __builtin_aarch64_set_qregci##qmode (__o, \
-+ (signedtype) __temp.val[1], \
-+ 1); \
-+ __o = __builtin_aarch64_set_qregci##qmode (__o, \
-+ (signedtype) __temp.val[2], \
-+ 2); \
-+ __o = __builtin_aarch64_ld3_lane##mode ( \
-+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-+ __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
-+ __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
-+ __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
-+ return __b; \
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vld1q_dup_f64 (const float64_t* __a)
--{
-- return vdupq_n_f64 (*__a);
--}
-+__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
-+ v8hf, hf, f16, float16x8_t)
-+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
-+ sf, f32, float32x4_t)
-+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
-+ df, f64, float64x2_t)
-+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
-+ int8x16_t)
-+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
-+ p16, int16x8_t)
-+__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di,
-+ v2di_ssps, di, p64, poly64x2_t)
-+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
-+ int8x16_t)
-+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
-+ int16x8_t)
-+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
-+ int32x4_t)
-+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64,
-+ int64x2_t)
-+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
-+ int8x16_t)
-+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi,
-+ u16, int16x8_t)
-+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si,
-+ u32, int32x4_t)
-+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
-+ u64, int64x2_t)
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vld1q_dup_p8 (const poly8_t* __a)
--{
-- return vdupq_n_p8 (*__a);
--}
-+#undef __LD3_LANE_FUNC
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_p16 (const poly16_t* __a)
--{
-- return vdupq_n_p16 (*__a);
--}
-+/* vld3q_lane */
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vld1q_dup_s8 (const int8_t* __a)
--{
-- return vdupq_n_s8 (*__a);
-+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-+__extension__ extern __inline intype \
-+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_ci __o; \
-+ intype ret; \
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
-+ __o = __builtin_aarch64_ld3_lane##mode ( \
-+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-+ ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
-+ ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
-+ ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
-+ return ret; \
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_s16 (const int16_t* __a)
--{
-- return vdupq_n_s16 (*__a);
--}
-+__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
-+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
-+__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
-+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-+__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
-+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
-+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
-+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
-+__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
-+__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-+__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
-+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vld1q_dup_s32 (const int32_t* __a)
--{
-- return vdupq_n_s32 (*__a);
--}
-+#undef __LD3_LANE_FUNC
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vld1q_dup_s64 (const int64_t* __a)
--{
-- return vdupq_n_s64 (*__a);
--}
-+/* vld4_lane */
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vld1q_dup_u8 (const uint8_t* __a)
--{
-- return vdupq_n_u8 (*__a);
-+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-+ qmode, ptrmode, funcsuffix, signedtype) \
-+__extension__ extern __inline intype \
-+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_xi __o; \
-+ largetype __temp; \
-+ __temp.val[0] = \
-+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-+ __temp.val[1] = \
-+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-+ __temp.val[2] = \
-+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
-+ __temp.val[3] = \
-+ vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[0], \
-+ 0); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[1], \
-+ 1); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[2], \
-+ 2); \
-+ __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-+ (signedtype) __temp.val[3], \
-+ 3); \
-+ __o = __builtin_aarch64_ld4_lane##mode ( \
-+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-+ __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
-+ __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
-+ __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
-+ __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
-+ return __b; \
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vld1q_dup_u16 (const uint16_t* __a)
--{
-- return vdupq_n_u16 (*__a);
--}
-+/* vld4q_lane */
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vld1q_dup_u32 (const uint32_t* __a)
--{
-- return vdupq_n_u32 (*__a);
--}
-+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
-+ v8hf, hf, f16, float16x8_t)
-+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
-+ sf, f32, float32x4_t)
-+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
-+ df, f64, float64x2_t)
-+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
-+ int8x16_t)
-+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
-+ p16, int16x8_t)
-+__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di,
-+ v2di_ssps, di, p64, poly64x2_t)
-+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
-+ int8x16_t)
-+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
-+ int16x8_t)
-+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
-+ int32x4_t)
-+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64,
-+ int64x2_t)
-+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
-+ int8x16_t)
-+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi,
-+ u16, int16x8_t)
-+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si,
-+ u32, int32x4_t)
-+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
-+ u64, int64x2_t)
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vld1q_dup_u64 (const uint64_t* __a)
--{
-- return vdupq_n_u64 (*__a);
--}
-+#undef __LD4_LANE_FUNC
-
--/* vld1_lane */
-+/* vld4q_lane */
-
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
--vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane)
--{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-+__extension__ extern __inline intype \
-+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-+{ \
-+ __builtin_aarch64_simd_xi __o; \
-+ intype ret; \
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
-+ __o = __builtin_aarch64_ld4_lane##mode ( \
-+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-+ ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
-+ ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
-+ ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
-+ ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
-+ return ret; \
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
--{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
--}
-+__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
-+__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
-+__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
-+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-+__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
-+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
-+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
-+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
-+__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
-+__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-+__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
-+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
--{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
--}
-+#undef __LD4_LANE_FUNC
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
--{
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
--}
-+/* vmax */
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smax_nanv2sf (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_f64 (float64x1_t __a, float64x1_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (float64x1_t)
-+ { __builtin_aarch64_smax_nandf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smaxv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smaxv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smaxv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a,
-+ (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a,
-+ (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a,
-+ (int32x2_t) __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smax_nanv4sf (__a, __b);
- }
-
--/* vld1q_lane */
--
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smax_nanv2df (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smaxv16qi (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smaxv8hi (__a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_smaxv4si (__a, __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a,
-+ (int8x16_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a,
-+ (int16x8_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a,
-+ (int32x4_t) __b);
- }
-+/* vmulx */
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_fmulxv2sf (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_fmulxv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_f64 (float64x1_t __a, float64x1_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_fmulxv2df (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxs_f32 (float32_t __a, float32_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_fmulxsf (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxd_f64 (float64_t __a, float64_t __b)
- {
-- return __aarch64_vset_lane_any (*__src, __vec, __lane);
-+ return __builtin_aarch64_fmulxdf (__a, __b);
- }
-
--/* vldn */
--
--__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
--vld2_s64 (const int64_t * __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
- {
-- int64x1x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-- return ret;
-+ return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
- }
-
--__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
--vld2_u64 (const uint64_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
- {
-- uint64x1x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-- return ret;
-+ return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
- }
-
--__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__))
--vld2_f64 (const float64_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
- {
-- float64x1x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
-- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
-- return ret;
-+ return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
--vld2_s8 (const int8_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
- {
-- int8x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-- return ret;
-+ return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
--vld2_p8 (const poly8_t * __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
- {
-- poly8x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-- return ret;
-+ return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
--vld2_s16 (const int16_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
- {
-- int16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-- return ret;
-+ return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
--vld2_p16 (const poly16_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
- {
-- poly16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-- return ret;
-+ return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
--vld2_s32 (const int32_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
- {
-- int32x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-- return ret;
-+ return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
--vld2_u8 (const uint8_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
- {
-- uint8x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-- return ret;
-+ return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
--vld2_u16 (const uint16_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
- {
-- uint16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-- return ret;
-+ return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
--vld2_u32 (const uint32_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
- {
-- uint32x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-- return ret;
-+ return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
- }
-
--__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
--vld2_f16 (const float16_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
- {
-- float16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4hf (__a);
-- ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
-- return ret;
-+ return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
- }
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
--vld2_f32 (const float32_t * __a)
-+/* vpmax */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_s8 (int8x8_t a, int8x8_t b)
- {
-- float32x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
-- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smaxpv8qi (a, b);
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
--vld2q_s8 (const int8_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_s16 (int16x4_t a, int16x4_t b)
- {
-- int8x16x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smaxpv4hi (a, b);
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
--vld2q_p8 (const poly8_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_s32 (int32x2_t a, int32x2_t b)
- {
-- poly8x16x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smaxpv2si (a, b);
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
--vld2q_s16 (const int16_t * __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_u8 (uint8x8_t a, uint8x8_t b)
- {
-- int16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-- return ret;
-+ return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a,
-+ (int8x8_t) b);
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
--vld2q_p16 (const poly16_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_u16 (uint16x4_t a, uint16x4_t b)
- {
-- poly16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-- return ret;
-+ return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a,
-+ (int16x4_t) b);
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
--vld2q_s32 (const int32_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_u32 (uint32x2_t a, uint32x2_t b)
- {
-- int32x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-- return ret;
-+ return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a,
-+ (int32x2_t) b);
- }
-
--__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
--vld2q_s64 (const int64_t * __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_s8 (int8x16_t a, int8x16_t b)
- {
-- int64x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smaxpv16qi (a, b);
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
--vld2q_u8 (const uint8_t * __a)
--{
-- uint8x16x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-- return ret;
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_s16 (int16x8_t a, int16x8_t b)
-+{
-+ return __builtin_aarch64_smaxpv8hi (a, b);
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
--vld2q_u16 (const uint16_t * __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_s32 (int32x4_t a, int32x4_t b)
- {
-- uint16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smaxpv4si (a, b);
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
--vld2q_u32 (const uint32_t * __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_u8 (uint8x16_t a, uint8x16_t b)
- {
-- uint32x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-- return ret;
-+ return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a,
-+ (int8x16_t) b);
- }
-
--__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
--vld2q_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_u16 (uint16x8_t a, uint16x8_t b)
- {
-- uint64x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-- return ret;
-+ return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a,
-+ (int16x8_t) b);
- }
-
--__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
--vld2q_f16 (const float16_t * __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_u32 (uint32x4_t a, uint32x4_t b)
- {
-- float16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v8hf (__a);
-- ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
-- return ret;
-+ return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a,
-+ (int32x4_t) b);
- }
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
--vld2q_f32 (const float32_t * __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_f32 (float32x2_t a, float32x2_t b)
- {
-- float32x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smax_nanpv2sf (a, b);
- }
-
--__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
--vld2q_f64 (const float64_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_f32 (float32x4_t a, float32x4_t b)
- {
-- float64x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smax_nanpv4sf (a, b);
- }
-
--__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
--vld3_s64 (const int64_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_f64 (float64x2_t a, float64x2_t b)
- {
-- int64x1x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smax_nanpv2df (a, b);
- }
-
--__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
--vld3_u64 (const uint64_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxqd_f64 (float64x2_t a)
- {
-- uint64x1x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2df (a);
- }
-
--__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__))
--vld3_f64 (const float64_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxs_f32 (float32x2_t a)
- {
-- float64x1x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
-- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
-- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
-- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a);
- }
-
--__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
--vld3_s8 (const int8_t * __a)
-+/* vpmaxnm */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnm_f32 (float32x2_t a, float32x2_t b)
- {
-- int8x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smaxpv2sf (a, b);
- }
-
--__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
--vld3_p8 (const poly8_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmq_f32 (float32x4_t a, float32x4_t b)
- {
-- poly8x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smaxpv4sf (a, b);
- }
-
--__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
--vld3_s16 (const int16_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmq_f64 (float64x2_t a, float64x2_t b)
- {
-- int16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smaxpv2df (a, b);
- }
-
--__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
--vld3_p16 (const poly16_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmqd_f64 (float64x2_t a)
- {
-- poly16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2df (a);
- }
-
--__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
--vld3_s32 (const int32_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnms_f32 (float32x2_t a)
- {
-- int32x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2sf (a);
- }
-
--__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
--vld3_u8 (const uint8_t * __a)
-+/* vpmin */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_s8 (int8x8_t a, int8x8_t b)
- {
-- uint8x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv8qi (a, b);
- }
-
--__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
--vld3_u16 (const uint16_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_s16 (int16x4_t a, int16x4_t b)
- {
-- uint16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv4hi (a, b);
- }
-
--__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
--vld3_u32 (const uint32_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_s32 (int32x2_t a, int32x2_t b)
- {
-- uint32x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv2si (a, b);
- }
-
--__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
--vld3_f16 (const float16_t * __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_u8 (uint8x8_t a, uint8x8_t b)
- {
-- float16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4hf (__a);
-- ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
-- ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
-- return ret;
-+ return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a,
-+ (int8x8_t) b);
- }
-
--__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
--vld3_f32 (const float32_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_u16 (uint16x4_t a, uint16x4_t b)
- {
-- float32x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
-- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
-- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
-- return ret;
-+ return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a,
-+ (int16x4_t) b);
- }
-
--__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
--vld3q_s8 (const int8_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_u32 (uint32x2_t a, uint32x2_t b)
- {
-- int8x16x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-- return ret;
-+ return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a,
-+ (int32x2_t) b);
- }
-
--__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
--vld3q_p8 (const poly8_t * __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_s8 (int8x16_t a, int8x16_t b)
- {
-- poly8x16x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv16qi (a, b);
- }
-
--__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
--vld3q_s16 (const int16_t * __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_s16 (int16x8_t a, int16x8_t b)
- {
-- int16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv8hi (a, b);
- }
-
--__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
--vld3q_p16 (const poly16_t * __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_s32 (int32x4_t a, int32x4_t b)
- {
-- poly16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv4si (a, b);
- }
-
--__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
--vld3q_s32 (const int32_t * __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_u8 (uint8x16_t a, uint8x16_t b)
- {
-- int32x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-- return ret;
-+ return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
-+ (int8x16_t) b);
- }
-
--__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
--vld3q_s64 (const int64_t * __a)
--{
-- int64x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-- return ret;
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_u16 (uint16x8_t a, uint16x8_t b)
-+{
-+ return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
-+ (int16x8_t) b);
- }
-
--__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
--vld3q_u8 (const uint8_t * __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_u32 (uint32x4_t a, uint32x4_t b)
- {
-- uint8x16x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-- return ret;
-+ return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
-+ (int32x4_t) b);
- }
-
--__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
--vld3q_u16 (const uint16_t * __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_f32 (float32x2_t a, float32x2_t b)
- {
-- uint16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smin_nanpv2sf (a, b);
- }
-
--__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
--vld3q_u32 (const uint32_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_f32 (float32x4_t a, float32x4_t b)
- {
-- uint32x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smin_nanpv4sf (a, b);
- }
-
--__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
--vld3q_u64 (const uint64_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_f64 (float64x2_t a, float64x2_t b)
- {
-- uint64x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-- return ret;
-+ return __builtin_aarch64_smin_nanpv2df (a, b);
- }
-
--__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
--vld3q_f16 (const float16_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminqd_f64 (float64x2_t a)
- {
-- float16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v8hf (__a);
-- ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
-- ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
- }
-
--__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
--vld3q_f32 (const float32_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmins_f32 (float32x2_t a)
- {
-- float32x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
- }
-
--__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
--vld3q_f64 (const float64_t * __a)
-+/* vpminnm */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnm_f32 (float32x2_t a, float32x2_t b)
- {
-- float64x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-- return ret;
-+ return __builtin_aarch64_sminpv2sf (a, b);
- }
-
--__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
--vld4_s64 (const int64_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmq_f32 (float32x4_t a, float32x4_t b)
- {
-- int64x1x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-- ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_sminpv4sf (a, b);
- }
-
--__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
--vld4_u64 (const uint64_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmq_f64 (float64x2_t a, float64x2_t b)
- {
-- uint64x1x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-- ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_sminpv2df (a, b);
- }
-
--__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__))
--vld4_f64 (const float64_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmqd_f64 (float64x2_t a)
- {
-- float64x1x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)};
-- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)};
-- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
-- ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2df (a);
- }
-
--__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
--vld4_s8 (const int8_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnms_f32 (float32x2_t a)
- {
-- int8x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-- ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2sf (a);
- }
-
--__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
--vld4_p8 (const poly8_t * __a)
-+/* vmaxnm */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
- {
-- poly8x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-- ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_fmaxv2sf (__a, __b);
- }
-
--__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
--vld4_s16 (const int16_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f64 (float64x1_t __a, float64x1_t __b)
- {
-- int16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-- ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-- return ret;
-+ return (float64x1_t)
-+ { __builtin_aarch64_fmaxdf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
- }
-
--__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
--vld4_p16 (const poly16_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- poly16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-- ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_fmaxv4sf (__a, __b);
- }
-
--__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
--vld4_s32 (const int32_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- int32x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-- ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-- return ret;
-+ return __builtin_aarch64_fmaxv2df (__a, __b);
- }
-
--__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
--vld4_u8 (const uint8_t * __a)
-+/* vmaxv */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_f32 (float32x2_t __a)
- {
-- uint8x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-- ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
- }
-
--__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
--vld4_u16 (const uint16_t * __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_s8 (int8x8_t __a)
- {
-- uint16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-- ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
- }
-
--__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
--vld4_u32 (const uint32_t * __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_s16 (int16x4_t __a)
- {
-- uint32x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-- ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
- }
-
--__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
--vld4_f16 (const float16_t * __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_s32 (int32x2_t __a)
- {
-- float16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4hf (__a);
-- ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1);
-- ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
-- ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2si (__a);
- }
-
--__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
--vld4_f32 (const float32_t * __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_u8 (uint8x8_t __a)
- {
-- float32x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0);
-- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1);
-- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
-- ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
- }
-
--__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
--vld4q_s8 (const int8_t * __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_u16 (uint16x4_t __a)
- {
-- int8x16x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-- ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
- }
-
--__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
--vld4q_p8 (const poly8_t * __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_u32 (uint32x2_t __a)
- {
-- poly8x16x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-- ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
- }
-
--__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
--vld4q_s16 (const int16_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_f32 (float32x4_t __a)
- {
-- int16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-- ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
- }
-
--__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
--vld4q_p16 (const poly16_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_f64 (float64x2_t __a)
- {
-- poly16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-- ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
- }
-
--__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
--vld4q_s32 (const int32_t * __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_s8 (int8x16_t __a)
- {
-- int32x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-- ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
- }
-
--__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
--vld4q_s64 (const int64_t * __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_s16 (int16x8_t __a)
- {
-- int64x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-- ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
- }
-
--__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
--vld4q_u8 (const uint8_t * __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_s32 (int32x4_t __a)
- {
-- uint8x16x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-- ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v4si (__a);
- }
-
--__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
--vld4q_u16 (const uint16_t * __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_u8 (uint8x16_t __a)
- {
-- uint16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-- ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
- }
-
--__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
--vld4q_u32 (const uint32_t * __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_u16 (uint16x8_t __a)
- {
-- uint32x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-- ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
- }
-
--__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
--vld4q_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_u32 (uint32x4_t __a)
- {
-- uint64x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-- ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
- }
-
--__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
--vld4q_f16 (const float16_t * __a)
-+/* vmaxnmv */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmv_f32 (float32x2_t __a)
- {
-- float16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v8hf (__a);
-- ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
-- ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
-- ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
- }
-
--__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
--vld4q_f32 (const float32_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmvq_f32 (float32x4_t __a)
- {
-- float32x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-- ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
- }
-
--__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
--vld4q_f64 (const float64_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmvq_f64 (float64x2_t __a)
- {
-- float64x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-- ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-- return ret;
-+ return __builtin_aarch64_reduc_smax_scal_v2df (__a);
- }
-
--/* vldn_dup */
-+/* vmin */
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
--vld2_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_f32 (float32x2_t __a, float32x2_t __b)
- {
-- int8x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smin_nanv2sf (__a, __b);
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
--vld2_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_f64 (float64x1_t __a, float64x1_t __b)
- {
-- int16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-- return ret;
-+ return (float64x1_t)
-+ { __builtin_aarch64_smin_nandf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
--vld2_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_s8 (int8x8_t __a, int8x8_t __b)
- {
-- int32x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-- return ret;
-+ return __builtin_aarch64_sminv8qi (__a, __b);
- }
-
--__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
--vld2_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_s16 (int16x4_t __a, int16x4_t __b)
- {
-- float16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-- ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
-- ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
-- return ret;
-+ return __builtin_aarch64_sminv4hi (__a, __b);
- }
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
--vld2_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_s32 (int32x2_t __a, int32x2_t __b)
- {
-- float32x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
-- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
-- return ret;
-+ return __builtin_aarch64_sminv2si (__a, __b);
- }
-
--__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__))
--vld2_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- float64x1x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
-- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
-- return ret;
-+ return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
-+ (int8x8_t) __b);
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
--vld2_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- uint8x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-- return ret;
-+ return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
-+ (int16x4_t) __b);
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
--vld2_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- uint16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-- return ret;
-+ return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
-+ (int32x2_t) __b);
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
--vld2_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- uint32x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
-- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smin_nanv4sf (__a, __b);
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
--vld2_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- poly8x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
-- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_smin_nanv2df (__a, __b);
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
--vld2_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- poly16x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
-- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_sminv16qi (__a, __b);
- }
-
--__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
--vld2_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- int64x1x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_sminv8hi (__a, __b);
- }
-
--__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
--vld2_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- uint64x1x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
-- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_sminv4si (__a, __b);
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
--vld2q_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- int8x16x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-- return ret;
-+ return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
-+ (int8x16_t) __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_u16 (uint16x8_t __a, uint16x8_t __b)
-+{
-+ return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
-+ (int16x8_t) __b);
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
--vld2q_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_u32 (uint32x4_t __a, uint32x4_t __b)
-+{
-+ return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
-+ (int32x4_t) __b);
-+}
-+
-+/* vminnm */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f32 (float32x2_t __a, float32x2_t __b)
- {
-- poly8x16x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_fminv2sf (__a, __b);
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
--vld2q_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f64 (float64x1_t __a, float64x1_t __b)
- {
-- int16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-- return ret;
-+ return (float64x1_t)
-+ { __builtin_aarch64_fmindf (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
--vld2q_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- poly16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_fminv4sf (__a, __b);
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
--vld2q_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- int32x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-- return ret;
-+ return __builtin_aarch64_fminv2df (__a, __b);
- }
-
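Editorial aside, not part of the patch: the vminnm_* intrinsics map to the FMINNM instruction, which follows IEEE 754-2008 minNum semantics, so a quiet NaN in one operand is ignored in favour of the numeric operand. A small AArch64 sketch of that behaviour:

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main (void)
{
  float32x2_t a = (float32x2_t) { NAN, 2.0f };
  float32x2_t b = (float32x2_t) { 1.0f, 5.0f };
  float32x2_t m = vminnm_f32 (a, b);  /* NaN lane yields the number */
  printf ("%f %f\n", vget_lane_f32 (m, 0), vget_lane_f32 (m, 1));
  /* expected: 1.000000 2.000000 */
  return 0;
}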
--__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
--vld2q_dup_s64 (const int64_t * __a)
-+/* vminv */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_f32 (float32x2_t __a)
- {
-- int64x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
--vld2q_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_s8 (int8x8_t __a)
- {
-- uint8x16x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
-- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
--vld2q_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_s16 (int16x4_t __a)
- {
-- uint16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
-- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
--vld2q_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_s32 (int32x2_t __a)
- {
-- uint32x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
-- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2si (__a);
- }
-
--__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
--vld2q_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_u8 (uint8x8_t __a)
- {
-- uint64x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
-- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
- }
-
--__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
--vld2q_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_u16 (uint16x4_t __a)
- {
-- float16x8x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-- ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
-- ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
- }
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
--vld2q_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_u32 (uint32x2_t __a)
- {
-- float32x4x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
-- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
- }
-
--__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
--vld2q_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_f32 (float32x4_t __a)
- {
-- float64x2x2_t ret;
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
-- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
- }
-
--__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
--vld3_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_f64 (float64x2_t __a)
- {
-- int64x1x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
- }
-
--__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
--vld3_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_s8 (int8x16_t __a)
- {
-- uint64x1x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
-- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
-- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
- }
-
--__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__))
--vld3_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_s16 (int16x8_t __a)
- {
-- float64x1x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)};
-- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)};
-- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)};
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
- }
-
--__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
--vld3_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_s32 (int32x4_t __a)
- {
-- int8x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v4si (__a);
- }
-
--__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
--vld3_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_u8 (uint8x16_t __a)
- {
-- poly8x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
- }
-
--__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
--vld3_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_u16 (uint16x8_t __a)
- {
-- int16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
- }
-
--__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
--vld3_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_u32 (uint32x4_t __a)
- {
-- poly16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
- }
-
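Editorial aside, not part of the patch: the vminv_*/vminvq_* intrinsics above are horizontal reductions; they collapse a whole vector to its minimum lane via the reduc_*_scal builtins. A minimal AArch64 example:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  int8x8_t v = (int8x8_t) { 4, -1, 7, 0, 3, -5, 9, 2 };
  int8_t m = vminv_s8 (v);  /* reduce all eight lanes */
  printf ("%d\n", m);       /* prints -5 */
  return 0;
}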
--__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
--vld3_dup_s32 (const int32_t * __a)
-+/* vminnmv */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmv_f32 (float32x2_t __a)
- {
-- int32x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
- }
-
--__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
--vld3_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmvq_f32 (float32x4_t __a)
- {
-- uint8x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0);
-- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1);
-- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
- }
-
--__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
--vld3_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmvq_f64 (float64x2_t __a)
- {
-- uint16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0);
-- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1);
-- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2);
-- return ret;
-+ return __builtin_aarch64_reduc_smin_scal_v2df (__a);
- }
-
--__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
--vld3_dup_u32 (const uint32_t * __a)
-+/* vmla */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
- {
-- uint32x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0);
-- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1);
-- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2);
-- return ret;
-+ return a + b * c;
- }
-
--__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
--vld3_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
- {
-- float16x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-- ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
-- ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
-- ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
-- return ret;
-+ return __a + __b * __c;
- }
-
--__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
--vld3_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
- {
-- float32x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0);
-- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1);
-- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2);
-- return ret;
-+ return a + b * c;
- }
-
--__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
--vld3q_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
- {
-- int8x16x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-- return ret;
-+ return a + b * c;
- }
-
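Editorial aside, not part of the patch: as the bodies above show, the floating-point vmla_*/vmlaq_* forms are defined as plain a + b * c rather than as an explicitly fused operation, leaving any contraction to the compiler. Illustrative use on AArch64:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float32x2_t acc = vdup_n_f32 (1.0f);
  float32x2_t b   = vdup_n_f32 (2.0f);
  float32x2_t c   = vdup_n_f32 (3.0f);
  float32x2_t r   = vmla_f32 (acc, b, c);  /* acc + b * c */
  printf ("%f\n", vget_lane_f32 (r, 0));   /* prints 7.000000 */
  return 0;
}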
--__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
--vld3q_dup_p8 (const poly8_t * __a)
-+/* vmla_lane */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- poly8x16x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
--vld3q_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x4_t __c, const int __lane)
- {
-- int16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
--vld3q_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x2_t __c, const int __lane)
- {
-- poly16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
--vld3q_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x4_t __c, const int __lane)
- {
-- int32x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
--vld3q_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x2_t __c, const int __lane)
- {
-- int64x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
--vld3q_dup_u8 (const uint8_t * __a)
-+/* vmla_laneq */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- uint8x16x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
-- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
-- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
--vld3q_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x8_t __c, const int __lane)
- {
-- uint16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
-- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
-- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
--vld3q_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x4_t __c, const int __lane)
- {
-- uint32x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
-- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
-- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
--vld3q_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x8_t __c, const int __lane)
- {
-- uint64x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
-- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
-- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
--vld3q_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x4_t __c, const int __lane)
- {
-- float16x8x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-- ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
-- ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
-- ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
--vld3q_dup_f32 (const float32_t * __a)
-+/* vmlaq_lane */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- float32x4x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
-- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
-- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
--vld3q_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x4_t __c, const int __lane)
- {
-- float64x2x3_t ret;
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
-- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
-- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
--vld4_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x2_t __c, const int __lane)
- {
-- int64x1x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-- ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-- ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-- ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
--vld4_dup_u64 (const uint64_t * __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x4_t __c, const int __lane)
- {
-- uint64x1x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
-- ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
-- ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
-- ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__))
--vld4_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x2_t __c, const int __lane)
- {
-- float64x1x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)};
-- ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)};
-- ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)};
-- ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)};
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
--vld4_dup_s8 (const int8_t * __a)
-+ /* vmlaq_laneq */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- int8x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-- ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-- ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-- ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
--vld4_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x8_t __c, const int __lane)
- {
-- poly8x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-- ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-- ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-- ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
--vld4_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x4_t __c, const int __lane)
- {
-- int16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-- ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-- ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-- ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
--vld4_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x8_t __c, const int __lane)
- {
-- poly16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-- ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-- ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-- ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
--vld4_dup_s32 (const int32_t * __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x4_t __c, const int __lane)
- {
-- int32x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-- ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-- ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-- ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-- return ret;
-+ return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
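Editorial aside, not part of the patch: the _lane/_laneq variants above multiply by a single broadcast lane of the third operand, selected through __aarch64_vget_lane_any with a compile-time constant index. For instance, on AArch64:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float32x4_t acc  = vdupq_n_f32 (0.0f);
  float32x4_t b    = (float32x4_t) { 1.0f, 2.0f, 3.0f, 4.0f };
  float32x2_t coef = (float32x2_t) { 10.0f, 100.0f };
  /* Every lane of b is scaled by coef[1]; the index must be constant.  */
  float32x4_t r = vmlaq_lane_f32 (acc, b, coef, 1);
  printf ("%f\n", vgetq_lane_f32 (r, 3));  /* prints 400.000000 */
  return 0;
}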
--__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
--vld4_dup_u8 (const uint8_t * __a)
-+/* vmls */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
- {
-- uint8x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
-- ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
-- ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
-- ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
-- return ret;
-+ return a - b * c;
- }
-
--__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
--vld4_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
- {
-- uint16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
-- ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
-- ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
-- ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
-- return ret;
-+ return __a - __b * __c;
- }
-
--__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
--vld4_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
- {
-- uint32x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
-- ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
-- ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
-- ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
-- return ret;
-+ return a - b * c;
- }
-
--__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
--vld4_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
- {
-- float16x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
-- ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
-- ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
-- ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
-- ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
-- return ret;
-+ return a - b * c;
- }
-
--__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
--vld4_dup_f32 (const float32_t * __a)
-+/* vmls_lane */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- float32x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0);
-- ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1);
-- ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2);
-- ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
--vld4q_dup_s8 (const int8_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x4_t __c, const int __lane)
- {
-- int8x16x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-- ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
--vld4q_dup_p8 (const poly8_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x2_t __c, const int __lane)
- {
-- poly8x16x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-- ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
--vld4q_dup_s16 (const int16_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x4_t __c, const int __lane)
- {
-- int16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-- ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
--vld4q_dup_p16 (const poly16_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x2_t __c, const int __lane)
- {
-- poly16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-- ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
--vld4q_dup_s32 (const int32_t * __a)
-+/* vmls_laneq */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
-+ float32x4_t __c, const int __lane)
- {
-- int32x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-- ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
--vld4q_dup_s64 (const int64_t * __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
-+ int16x8_t __c, const int __lane)
- {
-- int64x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-- ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
--vld4q_dup_u8 (const uint8_t * __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
-+ int32x4_t __c, const int __lane)
- {
-- uint8x16x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a);
-- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
-- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
-- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
-- ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
--vld4q_dup_u16 (const uint16_t * __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-+ uint16x8_t __c, const int __lane)
- {
-- uint16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a);
-- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
-- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
-- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
-- ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
--vld4q_dup_u32 (const uint32_t * __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-+ uint32x4_t __c, const int __lane)
- {
-- uint32x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a);
-- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
-- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
-- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
-- ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
--vld4q_dup_u64 (const uint64_t * __a)
-+/* vmlsq_lane */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x2_t __c, const int __lane)
- {
-- uint64x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
-- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
-- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
-- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
-- ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
--vld4q_dup_f16 (const float16_t * __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x4_t __c, const int __lane)
- {
-- float16x8x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
-- ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
-- ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
-- ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
-- ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
--vld4q_dup_f32 (const float32_t * __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x2_t __c, const int __lane)
- {
-- float32x4x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a);
-- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
-- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
-- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
-- ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
--vld4q_dup_f64 (const float64_t * __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x4_t __c, const int __lane)
- {
-- float64x2x4_t ret;
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a);
-- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
-- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
-- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
-- ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
-- return ret;
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--/* vld2_lane */
--
--#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-- qmode, ptrmode, funcsuffix, signedtype) \
--__extension__ static __inline intype __attribute__ ((__always_inline__)) \
--vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_oi __o; \
-- largetype __temp; \
-- __temp.val[0] = \
-- vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-- __temp.val[1] = \
-- vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-- __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-- (signedtype) __temp.val[0], \
-- 0); \
-- __o = __builtin_aarch64_set_qregoi##qmode (__o, \
-- (signedtype) __temp.val[1], \
-- 1); \
-- __o = __builtin_aarch64_ld2_lane##mode ( \
-- (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-- __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
-- __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
-- return __b; \
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x2_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
--__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
-- v8hf, hf, f16, float16x8_t)
--__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
-- sf, f32, float32x4_t)
--__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df,
-- df, f64, float64x2_t)
--__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-- int8x16_t)
--__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi,
-- p16, int16x8_t)
--__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-- int8x16_t)
--__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-- int16x8_t)
--__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-- int32x4_t)
--__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-- int64x2_t)
--__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-- int8x16_t)
--__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi,
-- u16, int16x8_t)
--__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si,
-- u32, int32x4_t)
--__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
-- u64, int64x2_t)
-+ /* vmlsq_laneq */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-+ float32x4_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+}
-
--#undef __LD2_LANE_FUNC
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-+ int16x8_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+}
-
--/* vld2q_lane */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-+ int32x4_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+}
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-+ uint16x8_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+}
-
--#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
--__extension__ static __inline intype __attribute__ ((__always_inline__)) \
--vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_oi __o; \
-- intype ret; \
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
-- __o = __builtin_aarch64_ld2_lane##mode ( \
-- (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-- ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
-- ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
-- return ret; \
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-+ uint32x4_t __c, const int __lane)
-+{
-+ return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
- }
-
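Editorial aside, not part of the patch: vmls_* mirrors vmla_* with subtraction, acc - b * c, and the lane forms subtract against one broadcast lane. A short AArch64 example:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  int16x4_t acc = vdup_n_s16 (100);
  int16x4_t b   = (int16x4_t) { 1, 2, 3, 4 };
  int16x4_t v   = (int16x4_t) { 5, 6, 7, 8 };
  int16x4_t r   = vmls_lane_s16 (acc, b, v, 2);  /* acc - b * v[2] */
  printf ("%d\n", vget_lane_s16 (r, 3));         /* 100 - 4*7 = 72 */
  return 0;
}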
--__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
--__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
--__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
--__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
--__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
--__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
--__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
--__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
--__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
--__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
--__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
--__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
--__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
-+/* vmov_n_ */
-
--#undef __LD2_LANE_FUNC
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f16 (float16_t __a)
-+{
-+ return vdup_n_f16 (__a);
-+}
-
--/* vld3_lane */
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f32 (float32_t __a)
-+{
-+ return vdup_n_f32 (__a);
-+}
-
--#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-- qmode, ptrmode, funcsuffix, signedtype) \
--__extension__ static __inline intype __attribute__ ((__always_inline__)) \
--vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_ci __o; \
-- largetype __temp; \
-- __temp.val[0] = \
-- vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-- __temp.val[1] = \
-- vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-- __temp.val[2] = \
-- vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
-- __o = __builtin_aarch64_set_qregci##qmode (__o, \
-- (signedtype) __temp.val[0], \
-- 0); \
-- __o = __builtin_aarch64_set_qregci##qmode (__o, \
-- (signedtype) __temp.val[1], \
-- 1); \
-- __o = __builtin_aarch64_set_qregci##qmode (__o, \
-- (signedtype) __temp.val[2], \
-- 2); \
-- __o = __builtin_aarch64_ld3_lane##mode ( \
-- (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-- __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
-- __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
-- __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
-- return __b; \
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f64 (float64_t __a)
-+{
-+ return (float64x1_t) {__a};
- }
-
--__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
-- v8hf, hf, f16, float16x8_t)
--__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
-- sf, f32, float32x4_t)
--__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
-- df, f64, float64x2_t)
--__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
-- int8x16_t)
--__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
-- p16, int16x8_t)
--__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
-- int8x16_t)
--__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
-- int16x8_t)
--__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32,
-- int32x4_t)
--__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64,
-- int64x2_t)
--__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8,
-- int8x16_t)
--__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi,
-- u16, int16x8_t)
--__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si,
-- u32, int32x4_t)
--__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
-- u64, int64x2_t)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_p8 (poly8_t __a)
-+{
-+ return vdup_n_p8 (__a);
-+}
-
--#undef __LD3_LANE_FUNC
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_p16 (poly16_t __a)
-+{
-+ return vdup_n_p16 (__a);
-+}
-
--/* vld3q_lane */
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_p64 (poly64_t __a)
-+{
-+ return vdup_n_p64 (__a);
-+}
-
--#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
--__extension__ static __inline intype __attribute__ ((__always_inline__)) \
--vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_ci __o; \
-- intype ret; \
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
-- __o = __builtin_aarch64_ld3_lane##mode ( \
-- (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-- ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
-- ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
-- ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
-- return ret; \
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s8 (int8_t __a)
-+{
-+ return vdup_n_s8 (__a);
- }
-
--__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
--__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
--__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
--__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
--__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
--__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
--__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
--__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
--__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
--__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
--__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
--__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
--__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s16 (int16_t __a)
-+{
-+ return vdup_n_s16 (__a);
-+}
-
--#undef __LD3_LANE_FUNC
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s32 (int32_t __a)
-+{
-+ return vdup_n_s32 (__a);
-+}
-
--/* vld4_lane */
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_s64 (int64_t __a)
-+{
-+ return (int64x1_t) {__a};
-+}
-
--#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \
-- qmode, ptrmode, funcsuffix, signedtype) \
--__extension__ static __inline intype __attribute__ ((__always_inline__)) \
--vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_xi __o; \
-- largetype __temp; \
-- __temp.val[0] = \
-- vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
-- __temp.val[1] = \
-- vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
-- __temp.val[2] = \
-- vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
-- __temp.val[3] = \
-- vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[0], \
-- 0); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[1], \
-- 1); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[2], \
-- 2); \
-- __o = __builtin_aarch64_set_qregxi##qmode (__o, \
-- (signedtype) __temp.val[3], \
-- 3); \
-- __o = __builtin_aarch64_ld4_lane##mode ( \
-- (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-- __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
-- __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
-- __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
-- __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
-- return __b; \
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u8 (uint8_t __a)
-+{
-+ return vdup_n_u8 (__a);
- }
-
--/* vld4q_lane */
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u16 (uint16_t __a)
-+{
-+ return vdup_n_u16 (__a);
-+}
-
--__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
-- v8hf, hf, f16, float16x8_t)
--__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
-- sf, f32, float32x4_t)
--__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
-- df, f64, float64x2_t)
--__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
-- int8x16_t)
--__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
-- p16, int16x8_t)
--__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
-- int8x16_t)
--__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
-- int16x8_t)
--__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
-- int32x4_t)
--__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64,
-- int64x2_t)
--__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
-- int8x16_t)
--__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi,
-- u16, int16x8_t)
--__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si,
-- u32, int32x4_t)
--__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
-- u64, int64x2_t)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u32 (uint32_t __a)
-+{
-+ return vdup_n_u32 (__a);
-+}
-+
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_u64 (uint64_t __a)
-+{
-+ return (uint64x1_t) {__a};
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f16 (float16_t __a)
-+{
-+ return vdupq_n_f16 (__a);
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f32 (float32_t __a)
-+{
-+ return vdupq_n_f32 (__a);
-+}
-+
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f64 (float64_t __a)
-+{
-+ return vdupq_n_f64 (__a);
-+}
-
--#undef __LD4_LANE_FUNC
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_p8 (poly8_t __a)
-+{
-+ return vdupq_n_p8 (__a);
-+}
-
--/* vld4q_lane */
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_p16 (poly16_t __a)
-+{
-+ return vdupq_n_p16 (__a);
-+}
-
--#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
--__extension__ static __inline intype __attribute__ ((__always_inline__)) \
--vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
--{ \
-- __builtin_aarch64_simd_xi __o; \
-- intype ret; \
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
-- __o = __builtin_aarch64_ld4_lane##mode ( \
-- (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-- ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
-- ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
-- ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
-- ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
-- return ret; \
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_p64 (poly64_t __a)
-+{
-+ return vdupq_n_p64 (__a);
- }
-
--__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
--__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
--__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
--__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
--__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
--__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
--__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
--__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
--__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
--__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
--__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
--__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
--__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s8 (int8_t __a)
-+{
-+ return vdupq_n_s8 (__a);
-+}
-
--#undef __LD4_LANE_FUNC
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s16 (int16_t __a)
-+{
-+ return vdupq_n_s16 (__a);
-+}
-
--/* vmax */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s32 (int32_t __a)
-+{
-+ return vdupq_n_s32 (__a);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmax_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_s64 (int64_t __a)
- {
-- return __builtin_aarch64_smax_nanv2sf (__a, __b);
-+ return vdupq_n_s64 (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmax_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u8 (uint8_t __a)
- {
-- return __builtin_aarch64_smaxv8qi (__a, __b);
-+ return vdupq_n_u8 (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmax_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u16 (uint16_t __a)
- {
-- return __builtin_aarch64_smaxv4hi (__a, __b);
-+ return vdupq_n_u16 (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmax_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u32 (uint32_t __a)
- {
-- return __builtin_aarch64_smaxv2si (__a, __b);
-+ return vdupq_n_u32 (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmax_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_u64 (uint64_t __a)
- {
-- return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a,
-- (int8x8_t) __b);
-+ return vdupq_n_u64 (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmax_u16 (uint16x4_t __a, uint16x4_t __b)
-+/* vmul_lane */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
- {
-- return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a,
-- (int16x4_t) __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmax_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
- {
-- return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a,
-- (int32x2_t) __b);
-+ return __a * __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmaxq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_smax_nanv4sf (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmaxq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_smax_nanv2df (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmaxq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_smaxv16qi (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmaxq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_smaxv8hi (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmaxq_s32 (int32x4_t __a, int32x4_t __b)
-+/* vmuld_lane */
-+
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane)
- {
-- return __builtin_aarch64_smaxv4si (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane)
- {
-- return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a,
-- (int8x16_t) __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
-+/* vmuls_lane */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane)
- {
-- return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a,
-- (int16x8_t) __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane)
- {
-- return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a,
-- (int32x4_t) __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
--/* vmulx */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmulx_f32 (float32x2_t __a, float32x2_t __b)
-+/* vmul_laneq */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_fmulxv2sf (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulxq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_fmulxv4sf (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmulx_f64 (float64x1_t __a, float64x1_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
- {
-- return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulxq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_fmulxv2df (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmulxs_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
- {
-- return __builtin_aarch64_fmulxsf (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmulxd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
- {
-- return __builtin_aarch64_fmulxdf (__a, __b);
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane)
-+/* vmul_n */
-+
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_f64 (float64x1_t __a, float64_t __b)
- {
-- return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane));
-+ return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane)
-+/* vmulq_lane */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
- {
-- return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
- {
-- return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane));
-+ __AARCH64_LANE_CHECK (__a, __lane);
-+ return __a * __b[0];
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
- {
-- return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
- {
-- return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
- {
-- return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
- {
-- return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane)
-+/* vmulq_laneq */
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
- {
-- return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
- {
-- return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
- {
-- return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
- {
-- return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
- {
-- return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane));
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--/* vpmax */
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
-+{
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vpmax_s8 (int8x8_t a, int8x8_t b)
-+/* vmul_n. */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_f32 (float32x2_t __a, float32_t __b)
- {
-- return __builtin_aarch64_smaxpv8qi (a, b);
-+ return __a * __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vpmax_s16 (int16x4_t a, int16x4_t b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_f32 (float32x4_t __a, float32_t __b)
- {
-- return __builtin_aarch64_smaxpv4hi (a, b);
-+ return __a * __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vpmax_s32 (int32x2_t a, int32x2_t b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_f64 (float64x2_t __a, float64_t __b)
- {
-- return __builtin_aarch64_smaxpv2si (a, b);
-+ return __a * __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vpmax_u8 (uint8x8_t a, uint8x8_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_s16 (int16x4_t __a, int16_t __b)
- {
-- return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a,
-- (int8x8_t) b);
-+ return __a * __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vpmax_u16 (uint16x4_t a, uint16x4_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_s16 (int16x8_t __a, int16_t __b)
- {
-- return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a,
-- (int16x4_t) b);
-+ return __a * __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vpmax_u32 (uint32x2_t a, uint32x2_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_s32 (int32x2_t __a, int32_t __b)
- {
-- return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a,
-- (int32x2_t) b);
-+ return __a * __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vpmaxq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_s32 (int32x4_t __a, int32_t __b)
- {
-- return __builtin_aarch64_smaxpv16qi (a, b);
-+ return __a * __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vpmaxq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
- {
-- return __builtin_aarch64_smaxpv8hi (a, b);
-+ return __a * __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vpmaxq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
-+{
-+ return __a * __b;
-+}
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
-+{
-+ return __a * __b;
-+}
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
-+{
-+ return __a * __b;
-+}
-+
-+/* vmvn */
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_p8 (poly8x8_t __a)
- {
-- return __builtin_aarch64_smaxpv4si (a, b);
-+ return (poly8x8_t) ~((int8x8_t) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vpmaxq_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_s8 (int8x8_t __a)
- {
-- return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a,
-- (int8x16_t) b);
-+ return ~__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vpmaxq_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_s16 (int16x4_t __a)
- {
-- return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a,
-- (int16x8_t) b);
-+ return ~__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vpmaxq_u32 (uint32x4_t a, uint32x4_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_s32 (int32x2_t __a)
- {
-- return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a,
-- (int32x4_t) b);
-+ return ~__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vpmax_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_u8 (uint8x8_t __a)
- {
-- return __builtin_aarch64_smax_nanpv2sf (a, b);
-+ return ~__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vpmaxq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_u16 (uint16x4_t __a)
- {
-- return __builtin_aarch64_smax_nanpv4sf (a, b);
-+ return ~__a;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vpmaxq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvn_u32 (uint32x2_t __a)
- {
-- return __builtin_aarch64_smax_nanpv2df (a, b);
-+ return ~__a;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vpmaxqd_f64 (float64x2_t a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_p8 (poly8x16_t __a)
- {
-- return __builtin_aarch64_reduc_smax_nan_scal_v2df (a);
-+ return (poly8x16_t) ~((int8x16_t) __a);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vpmaxs_f32 (float32x2_t a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_s8 (int8x16_t __a)
- {
-- return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a);
-+ return ~__a;
- }
-
--/* vpmaxnm */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vpmaxnm_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_s16 (int16x8_t __a)
- {
-- return __builtin_aarch64_smaxpv2sf (a, b);
-+ return ~__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vpmaxnmq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_s32 (int32x4_t __a)
- {
-- return __builtin_aarch64_smaxpv4sf (a, b);
-+ return ~__a;
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vpmaxnmq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_u8 (uint8x16_t __a)
- {
-- return __builtin_aarch64_smaxpv2df (a, b);
-+ return ~__a;
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vpmaxnmqd_f64 (float64x2_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_u16 (uint16x8_t __a)
- {
-- return __builtin_aarch64_reduc_smax_scal_v2df (a);
-+ return ~__a;
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vpmaxnms_f32 (float32x2_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmvnq_u32 (uint32x4_t __a)
- {
-- return __builtin_aarch64_reduc_smax_scal_v2sf (a);
-+ return ~__a;
- }
-
--/* vpmin */
-+/* vneg */
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vpmin_s8 (int8x8_t a, int8x8_t b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_sminpv8qi (a, b);
-+ return -__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vpmin_s16 (int16x4_t a, int16x4_t b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_sminpv4hi (a, b);
-+ return -__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vpmin_s32 (int32x2_t a, int32x2_t b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s8 (int8x8_t __a)
- {
-- return __builtin_aarch64_sminpv2si (a, b);
-+ return -__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vpmin_u8 (uint8x8_t a, uint8x8_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s16 (int16x4_t __a)
- {
-- return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a,
-- (int8x8_t) b);
-+ return -__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vpmin_u16 (uint16x4_t a, uint16x4_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s32 (int32x2_t __a)
- {
-- return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a,
-- (int16x4_t) b);
-+ return -__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vpmin_u32 (uint32x2_t a, uint32x2_t b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_s64 (int64x1_t __a)
- {
-- return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a,
-- (int32x2_t) b);
-+ return -__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vpminq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_sminpv16qi (a, b);
-+ return -__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vpminq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_sminpv8hi (a, b);
-+ return -__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vpminq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s8 (int8x16_t __a)
- {
-- return __builtin_aarch64_sminpv4si (a, b);
-+ return -__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vpminq_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s16 (int16x8_t __a)
- {
-- return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
-- (int8x16_t) b);
-+ return -__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vpminq_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s32 (int32x4_t __a)
- {
-- return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
-- (int16x8_t) b);
-+ return -__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vpminq_u32 (uint32x4_t a, uint32x4_t b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_s64 (int64x2_t __a)
- {
-- return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
-- (int32x4_t) b);
-+ return -__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vpmin_f32 (float32x2_t a, float32x2_t b)
-+/* vpadd */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return __builtin_aarch64_smin_nanpv2sf (a, b);
-+ return __builtin_aarch64_faddpv2sf (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vpminq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- return __builtin_aarch64_smin_nanpv4sf (a, b);
-+ return __builtin_aarch64_faddpv4sf (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vpminq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- return __builtin_aarch64_smin_nanpv2df (a, b);
-+ return __builtin_aarch64_faddpv2df (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vpminqd_f64 (float64x2_t a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
-+ return __builtin_aarch64_addpv8qi (__a, __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vpmins_f32 (float32x2_t a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
-+ return __builtin_aarch64_addpv4hi (__a, __b);
- }
-
--/* vpminnm */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vpminnm_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return __builtin_aarch64_sminpv2sf (a, b);
-+ return __builtin_aarch64_addpv2si (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vpminnmq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- return __builtin_aarch64_sminpv4sf (a, b);
-+ return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a,
-+ (int8x8_t) __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vpminnmq_f64 (float64x2_t a, float64x2_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- return __builtin_aarch64_sminpv2df (a, b);
-+ return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a,
-+ (int16x4_t) __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vpminnmqd_f64 (float64x2_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- return __builtin_aarch64_reduc_smin_scal_v2df (a);
-+ return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a,
-+ (int32x2_t) __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vpminnms_f32 (float32x2_t a)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadds_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_reduc_smin_scal_v2sf (a);
-+ return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
- }
-
--/* vmaxnm */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddd_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_fmaxv2sf (__a, __b);
-+ return __builtin_aarch64_reduc_plus_scal_v2df (__a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddd_s64 (int64x2_t __a)
- {
-- return __builtin_aarch64_fmaxv4sf (__a, __b);
-+ return __builtin_aarch64_addpdi (__a);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddd_u64 (uint64x2_t __a)
- {
-- return __builtin_aarch64_fmaxv2df (__a, __b);
-+ return __builtin_aarch64_addpdi ((int64x2_t) __a);
- }
-
--/* vmaxv */
-+/* vqabs */
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmaxv_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsq_s64 (int64x2_t __a)
- {
-- return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
-+ return (int64x2_t) __builtin_aarch64_sqabsv2di (__a);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vmaxv_s8 (int8x8_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsb_s8 (int8_t __a)
- {
-- return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
-+ return (int8_t) __builtin_aarch64_sqabsqi (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vmaxv_s16 (int16x4_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsh_s16 (int16_t __a)
- {
-- return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
-+ return (int16_t) __builtin_aarch64_sqabshi (__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vmaxv_s32 (int32x2_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabss_s32 (int32_t __a)
- {
-- return __builtin_aarch64_reduc_smax_scal_v2si (__a);
-+ return (int32_t) __builtin_aarch64_sqabssi (__a);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vmaxv_u8 (uint8x8_t __a)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqabsd_s64 (int64_t __a)
- {
-- return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
-+ return __builtin_aarch64_sqabsdi (__a);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vmaxv_u16 (uint16x4_t __a)
--{
-- return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
--}
-+/* vqadd */
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vmaxv_u32 (uint32x2_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddb_s8 (int8_t __a, int8_t __b)
- {
-- return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
-+ return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmaxvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddh_s16 (int16_t __a, int16_t __b)
- {
-- return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
-+ return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmaxvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqadds_s32 (int32_t __a, int32_t __b)
- {
-- return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
-+ return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vmaxvq_s8 (int8x16_t __a)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddd_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
-+ return __builtin_aarch64_sqadddi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vmaxvq_s16 (int16x8_t __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddb_u8 (uint8_t __a, uint8_t __b)
- {
-- return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
-+ return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vmaxvq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddh_u16 (uint16_t __a, uint16_t __b)
- {
-- return __builtin_aarch64_reduc_smax_scal_v4si (__a);
-+ return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vmaxvq_u8 (uint8x16_t __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqadds_u32 (uint32_t __a, uint32_t __b)
- {
-- return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
-+ return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vmaxvq_u16 (uint16x8_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqaddd_u64 (uint64_t __a, uint64_t __b)
- {
-- return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
-+ return __builtin_aarch64_uqadddi_uuu (__a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vmaxvq_u32 (uint32x4_t __a)
-+/* vqdmlal */
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
-- return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
-+ return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c);
- }
-
--/* vmaxnmv */
--
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmaxnmv_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
- {
-- return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
-+ return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmaxnmvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
-+ return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmaxnmvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_reduc_smax_scal_v2df (__a);
-+ return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d);
- }
-
--/* vmin */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmin_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
- {
-- return __builtin_aarch64_smin_nanv2sf (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmin_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
- {
-- return __builtin_aarch64_sminv8qi (__a, __b);
-+ return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmin_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
- {
-- return __builtin_aarch64_sminv4hi (__a, __b);
-+ return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmin_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
- {
-- return __builtin_aarch64_sminv2si (__a, __b);
-+ return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmin_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
-- return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
-- (int8x8_t) __b);
-+ return __builtin_aarch64_sqdmlalv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmin_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
- {
-- return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
-- (int16x4_t) __b);
-+ return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmin_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
-+ int const __d)
- {
-- return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
-- (int32x2_t) __b);
-+ return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vminq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_smin_nanv4sf (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vminq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
- {
-- return __builtin_aarch64_smin_nanv2df (__a, __b);
-+ return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vminq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
- {
-- return __builtin_aarch64_sminv16qi (__a, __b);
-+ return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vminq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
- {
-- return __builtin_aarch64_sminv8hi (__a, __b);
-+ return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vminq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
- {
-- return __builtin_aarch64_sminv4si (__a, __b);
-+ return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vminq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c)
- {
-- return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
-- (int8x16_t) __b);
-+ return __builtin_aarch64_sqdmlalhi (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vminq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
- {
-- return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
-- (int16x8_t) __b);
-+ return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vminq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
- {
-- return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
-- (int32x4_t) __b);
-+ return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d);
- }
-
--/* vminnm */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vminnm_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c)
- {
-- return __builtin_aarch64_fminv2sf (__a, __b);
-+ return __builtin_aarch64_sqdmlalsi (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vminnmq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
- {
-- return __builtin_aarch64_fminv4sf (__a, __b);
-+ return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vminnmq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
- {
-- return __builtin_aarch64_fminv2df (__a, __b);
-+ return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d);
- }
-
--/* vminv */
-+/* vqdmlsl */
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vminv_f32 (float32x2_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
-- return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
-+ return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vminv_s8 (int8x8_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
- {
-- return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
-+ return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vminv_s16 (int16x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
-+ return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vminv_s32 (int32x2_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_reduc_smin_scal_v2si (__a);
-+ return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vminv_u8 (uint8x8_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
- {
-- return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
-+ return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vminv_u16 (uint16x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
- {
-- return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
-+ return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vminv_u32 (uint32x2_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
- {
-- return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
-+ return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vminvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
- {
-- return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
-+ return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vminvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
-- return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
-+ return __builtin_aarch64_sqdmlslv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vminvq_s8 (int8x16_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
- {
-- return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
-+ return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vminvq_s16 (int16x8_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
-+ return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vminvq_s32 (int32x4_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
-+ int const __d)
- {
-- return __builtin_aarch64_reduc_smin_scal_v4si (__a);
-+ return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vminvq_u8 (uint8x16_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
- {
-- return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
-+ return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vminvq_u16 (uint16x8_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
- {
-- return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
-+ return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vminvq_u32 (uint32x4_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
- {
-- return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
-+ return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d);
- }
-
--/* vminnmv */
--
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vminnmv_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
- {
-- return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
-+ return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vminnmvq_f32 (float32x4_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c)
- {
-- return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
-+ return __builtin_aarch64_sqdmlslhi (__a, __b, __c);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vminnmvq_f64 (float64x2_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
- {
-- return __builtin_aarch64_reduc_smin_scal_v2df (__a);
-+ return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d);
- }
-
--/* vmla */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
- {
-- return a + b * c;
-+ return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c)
- {
-- return __a + __b * __c;
-+ return __builtin_aarch64_sqdmlslsi (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
- {
-- return a + b * c;
-+ return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
- {
-- return a + b * c;
-+ return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d);
- }
-
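For orientation only: vqdmlsl is the subtracting counterpart of vqdmlal, and the conversion applied to it here is the same attribute swap. A small sketch under the same assumptions as above:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Per lane: acc - saturate_s32 (2 * b * c).  */
  int32x4_t acc = vdupq_n_s32 (1000);
  int32x4_t r = vqdmlsl_s16 (acc, vdup_n_s16 (4), vdup_n_s16 (5));
  assert (vgetq_lane_s32 (r, 0) == 1000 - 2 * 4 * 5);  /* 960 */
  return 0;
}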
--/* vmla_lane */
-+/* vqdmulh */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
-- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
-- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-- uint16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhh_s16 (int16_t __a, int16_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b);
- }
-
--/* vmla_laneq */
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
-+{
-+ return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
-- int16x8_t __c, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhs_s32 (int32_t __a, int32_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
-- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-- uint16x8_t __c, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c);
- }
-
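Illustrative aside: vqdmulh returns the high half of a doubled product, which is exactly a Q15/Q31 fixed-point multiply, hence its use of the sqdmulh builtins above.

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* Q15 multiply: saturate_s16 ((2 * a * b) >> 16).
     0.5 * 0.5 in Q15: 16384 * 16384 gives 8192, i.e. 0.25.  */
  assert (vqdmulhh_s16 (16384, 16384) == 8192);
  return 0;
}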
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-- uint32x4_t __c, const int __lane)
-+/* vqdmull */
-+
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmullv4hi (__a, __b);
- }
-
--/* vmlaq_lane */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_s16 (int16x8_t __a, int16x8_t __b)
-+{
-+ return __builtin_aarch64_sqdmull2v8hi (__a, __b);
-+}
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
-- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
-- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_n_s16 (int16x8_t __a, int16_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_nv8hi (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-- uint16x4_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c);
- }
-
-- /* vmlaq_laneq */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_n_s16 (int16x4_t __a, int16_t __b)
-+{
-+ return __builtin_aarch64_sqdmull_nv4hi (__a, __b);
-+}
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmullv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-- int16x8_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_s32 (int32x4_t __a, int32x4_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2v4si (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-- uint16x8_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_high_n_s32 (int32x4_t __a, int32_t __b)
- {
-- return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull2_nv4si (__a, __b);
- }
-
--/* vmls */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c)
- {
-- return a - b * c;
-+ return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c)
- {
-- return __a - __b * __c;
-+ return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmull_n_s32 (int32x2_t __a, int32_t __b)
- {
-- return a - b * c;
-+ return __builtin_aarch64_sqdmull_nv2si (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmullh_s16 (int16_t __a, int16_t __b)
- {
-- return a - b * c;
-+ return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b);
- }
-
--/* vmls_lane */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
-- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
-- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulls_s32 (int32_t __a, int32_t __b)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmullsi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-- uint16x4_t __c, const int __lane)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c);
- }
-
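The doubling in vqdmull is also why these intrinsics must saturate: squaring INT16_MIN doubles to exactly 2^31, one past INT32_MAX. Sketch only:

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* saturate_s32 (2 * a * b); the only input pair that overflows.  */
  assert (vqdmullh_s16 (INT16_MIN, INT16_MIN) == INT32_MAX);
  assert (vqdmullh_s16 (100, 200) == 2 * 100 * 200);
  return 0;
}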
--/* vmls_laneq */
-+/* vqmovn */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_s16 (int16x8_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
-- int16x8_t __c, const int __lane)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_s32 (int32x4_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
-- int32x4_t __c, const int __lane)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_s64 (int64x2_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-- uint16x8_t __c, const int __lane)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_u16 (uint16x8_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_u32 (uint32x4_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a);
- }
-
--/* vmlsq_lane */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-- float32x2_t __c, const int __lane)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovn_u64 (uint64x2_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
-- int16x4_t __c, const int __lane)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnh_s16 (int16_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int8_t) __builtin_aarch64_sqmovnhi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
-- int32x2_t __c, const int __lane)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovns_s32 (int32_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int16_t) __builtin_aarch64_sqmovnsi (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-- uint16x4_t __c, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnd_s64 (int64_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (int32_t) __builtin_aarch64_sqmovndi (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-- uint32x2_t __c, const int __lane)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnh_u16 (uint16_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint8_t) __builtin_aarch64_uqmovnhi (__a);
- }
-
-- /* vmlsq_laneq */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-- float32x4_t __c, const int __lane)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovns_u32 (uint32_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint16_t) __builtin_aarch64_uqmovnsi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-- int16x8_t __c, const int __lane)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovnd_u64 (uint64_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint32_t) __builtin_aarch64_uqmovndi (__a);
- }
-
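vqmovn narrows each element to half width, clamping to the narrow type's range instead of truncating bits. A short illustration with the scalar forms from the hunk above:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  assert (vqmovnh_s16 (300) == 127);    /* clamps to INT8_MAX */
  assert (vqmovnh_s16 (-300) == -128);  /* clamps to INT8_MIN */
  return 0;
}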
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-- int32x4_t __c, const int __lane)
-+/* vqmovun */
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_s16 (int16x8_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
- }
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-- uint16x8_t __c, const int __lane)
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_s32 (int32x4_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-- uint32x4_t __c, const int __lane)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovun_s64 (int64x2_t __a)
- {
-- return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
-+ return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
- }
-
--/* vmov_n_ */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmov_n_f32 (float32_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovunh_s16 (int16_t __a)
- {
-- return vdup_n_f32 (__a);
-+ return (int8_t) __builtin_aarch64_sqmovunhi (__a);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmov_n_f64 (float64_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovuns_s32 (int32_t __a)
- {
-- return (float64x1_t) {__a};
-+ return (int16_t) __builtin_aarch64_sqmovunsi (__a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vmov_n_p8 (poly8_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqmovund_s64 (int64_t __a)
- {
-- return vdup_n_p8 (__a);
-+ return (int32_t) __builtin_aarch64_sqmovundi (__a);
- }
-
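vqmovun is the signed-to-unsigned narrowing, so negative inputs clamp to zero. Sketch, not patch content:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  uint8x8_t lo = vqmovun_s16 (vdupq_n_s16 (-5));   /* all lanes 0 */
  uint8x8_t hi = vqmovun_s16 (vdupq_n_s16 (300));  /* all lanes 255 */
  assert (vget_lane_u8 (lo, 0) == 0 && vget_lane_u8 (hi, 0) == 255);
  return 0;
}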
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vmov_n_p16 (poly16_t __a)
-+/* vqneg */
-+
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegq_s64 (int64x2_t __a)
- {
-- return vdup_n_p16 (__a);
-+ return (int64x2_t) __builtin_aarch64_sqnegv2di (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vmov_n_s8 (int8_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegb_s8 (int8_t __a)
- {
-- return vdup_n_s8 (__a);
-+ return (int8_t) __builtin_aarch64_sqnegqi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmov_n_s16 (int16_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegh_s16 (int16_t __a)
- {
-- return vdup_n_s16 (__a);
-+ return (int16_t) __builtin_aarch64_sqneghi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmov_n_s32 (int32_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegs_s32 (int32_t __a)
- {
-- return vdup_n_s32 (__a);
-+ return (int32_t) __builtin_aarch64_sqnegsi (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vmov_n_s64 (int64_t __a)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqnegd_s64 (int64_t __a)
- {
-- return (int64x1_t) {__a};
-+ return __builtin_aarch64_sqnegdi (__a);
- }
-
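vqneg exists because two's-complement negation of the most negative value wraps back to itself; the saturating form clamps to the maximum instead. Illustration:

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  assert (vqnegb_s8 (INT8_MIN) == INT8_MAX);  /* -(-128) clamps to 127 */
  assert (vqnegb_s8 (5) == -5);               /* ordinary values negate */
  return 0;
}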
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vmov_n_u8 (uint8_t __a)
-+/* vqrdmulh */
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
-- return vdup_n_u8 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmov_n_u16 (uint16_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
-- return vdup_n_u16 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmov_n_u32 (uint32_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
- {
-- return vdup_n_u32 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vmov_n_u64 (uint64_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
- {
-- return (uint64x1_t) {__a};
-+ return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmovq_n_f32 (float32_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhh_s16 (int16_t __a, int16_t __b)
- {
-- return vdupq_n_f32 (__a);
-+ return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmovq_n_f64 (float64_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
- {
-- return vdupq_n_f64 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vmovq_n_p8 (poly8_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
- {
-- return vdupq_n_p8 (__a);
-+ return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vmovq_n_p16 (poly16_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhs_s32 (int32_t __a, int32_t __b)
- {
-- return vdupq_n_p16 (__a);
-+ return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vmovq_n_s8 (int8_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
- {
-- return vdupq_n_s8 (__a);
-+ return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmovq_n_s16 (int16_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
- {
-- return vdupq_n_s16 (__a);
-+ return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c);
- }
-
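vqrdmulh adds a rounding constant before taking the high half; comparing it with the truncating vqdmulh makes the difference visible. Illustrative only:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  /* (2*1*16384) >> 16 truncates to 0; rounding first gives 1.  */
  assert (vqdmulhh_s16 (1, 16384) == 0);
  assert (vqrdmulhh_s16 (1, 16384) == 1);
  return 0;
}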
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmovq_n_s32 (int32_t __a)
-+/* vqrshl */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return vdupq_n_s32 (__a);
-+ return __builtin_aarch64_sqrshlv8qi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vmovq_n_s64 (int64_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return vdupq_n_s64 (__a);
-+ return __builtin_aarch64_sqrshlv4hi (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vmovq_n_u8 (uint8_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return vdupq_n_u8 (__a);
-+ return __builtin_aarch64_sqrshlv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmovq_n_u16 (uint16_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_s64 (int64x1_t __a, int64x1_t __b)
- {
-- return vdupq_n_u16 (__a);
-+ return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmovq_n_u32 (uint32_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
-- return vdupq_n_u32 (__a);
-+ return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vmovq_n_u64 (uint64_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
-- return vdupq_n_u64 (__a);
-+ return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b);
- }
-
--/* vmul_lane */
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
-+{
-+ return __builtin_aarch64_uqrshlv2si_uus ( __a, __b);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])};
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- return __a * __b;
-+ return __builtin_aarch64_sqrshlv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv4si (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlv2di (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b);
- }
-
--/* vmuld_lane */
--
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv4si_uus ( __a, __b);
- }
-
--/* vmuls_lane */
--
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlv2di_uus ( __a, __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlb_s8 (int8_t __a, int8_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlqi (__a, __b);
- }
-
--/* vmul_laneq */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlh_s16 (int16_t __a, int16_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlhi (__a, __b);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshls_s32 (int32_t __a, int32_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshlsi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshld_s64 (int64_t __a, int64_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_sqrshldi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlb_u8 (uint8_t __a, uint8_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlqi_uus (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshlh_u16 (uint16_t __a, uint16_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlhi_uus (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshls_u32 (uint32_t __a, uint32_t __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshlsi_uus (__a, __b);
- }
-
--/* vmul_n */
--
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vmul_n_f64 (float64x1_t __a, float64_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshld_u64 (uint64_t __a, uint64_t __b)
- {
-- return (float64x1_t) { vget_lane_f64 (__a, 0) * __b };
-+ return __builtin_aarch64_uqrshldi_uus (__a, __b);
- }
-
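In vqrshl (and vqshl further down) a negative shift count shifts right; the r form rounds where the plain form truncates. Hypothetical harness:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  assert (vqshls_s32 (5, -1) == 2);   /* truncating 5 >> 1 */
  assert (vqrshls_s32 (5, -1) == 3);  /* rounded: (5 + 1) >> 1 */
  return 0;
}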
--/* vmulq_lane */
-+/* vqrshrn */
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_s16 (int16x8_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_s32 (int32x4_t __a, const int __b)
- {
-- __AARCH64_LANE_CHECK (__a, __lane);
-- return __a * __b[0];
-+ return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_s64 (int64x2_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_u16 (uint16x8_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_u32 (uint32x4_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrn_n_u64 (uint64x2_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b);
- }
-
--/* vmulq_laneq */
--
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnh_n_s16 (int16_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrns_n_s32 (int32_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnd_n_s64 (int64_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnh_n_u16 (uint16_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrns_n_u32 (uint32_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrnd_n_u64 (uint64_t __a, const int __b)
- {
-- return __a * __aarch64_vget_lane_any (__b, __lane);
-+ return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
- }
-
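vqrshrn fuses three steps: round, shift right by an immediate, and saturating narrow. The shift count must be a compile-time constant in range for the element width. Sketch:

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  assert (vqrshrns_n_s32 (5, 1) == 3);               /* (5 + 1) >> 1 */
  assert (vqrshrns_n_s32 (1 << 20, 2) == INT16_MAX); /* clamps on narrow */
  return 0;
}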
--/* vneg */
-+/* vqrshrun */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vneg_f32 (float32x2_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrun_n_s16 (int16x8_t __a, const int __b)
- {
-- return -__a;
-+ return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vneg_f64 (float64x1_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrun_n_s32 (int32x4_t __a, const int __b)
- {
-- return -__a;
-+ return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vneg_s8 (int8x8_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrun_n_s64 (int64x2_t __a, const int __b)
- {
-- return -__a;
-+ return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vneg_s16 (int16x4_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrunh_n_s16 (int16_t __a, const int __b)
- {
-- return -__a;
-+ return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vneg_s32 (int32x2_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshruns_n_s32 (int32_t __a, const int __b)
- {
-- return -__a;
-+ return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vneg_s64 (int64x1_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqrshrund_n_s64 (int64_t __a, const int __b)
- {
-- return -__a;
-+ return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
- }
-
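vqrshrun narrows signed input to an unsigned result, so negatives clamp to zero on top of the rounding shift. Not patch content:

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  uint8x8_t z = vqrshrun_n_s16 (vdupq_n_s16 (-100), 2);  /* all 0 */
  uint8x8_t r = vqrshrun_n_s16 (vdupq_n_s16 (1000), 2);  /* (1000+2)>>2 */
  assert (vget_lane_u8 (z, 0) == 0 && vget_lane_u8 (r, 0) == 250);
  return 0;
}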
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vnegq_f32 (float32x4_t __a)
-+/* vqshl */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return -__a;
-+ return __builtin_aarch64_sqshlv8qi (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vnegq_f64 (float64x2_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return -__a;
-+ return __builtin_aarch64_sqshlv4hi (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vnegq_s8 (int8x16_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return -__a;
-+ return __builtin_aarch64_sqshlv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vnegq_s16 (int16x8_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_s64 (int64x1_t __a, int64x1_t __b)
- {
-- return -__a;
-+ return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vnegq_s32 (int32x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
-- return -__a;
-+ return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vnegq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
-- return -__a;
-+ return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
- }
-
--/* vpadd */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vpadd_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
-- return __builtin_aarch64_addpv8qi (__a, __b);
-+ return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vpadd_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
-- return __builtin_aarch64_addpv4hi (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vpadd_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- return __builtin_aarch64_addpv2si (__a, __b);
-+ return __builtin_aarch64_sqshlv16qi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a,
-- (int8x8_t) __b);
-+ return __builtin_aarch64_sqshlv8hi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a,
-- (int16x4_t) __b);
-+ return __builtin_aarch64_sqshlv4si (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a,
-- (int32x2_t) __b);
-+ return __builtin_aarch64_sqshlv2di (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vpaddd_f64 (float64x2_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
-- return __builtin_aarch64_reduc_plus_scal_v2df (__a);
-+ return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vpaddd_s64 (int64x2_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
-- return __builtin_aarch64_addpdi (__a);
-+ return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vpaddd_u64 (uint64x2_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
-- return __builtin_aarch64_addpdi ((int64x2_t) __a);
-+ return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
- }
-
--/* vqabs */
--
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqabsq_s64 (int64x2_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
-- return (int64x2_t) __builtin_aarch64_sqabsv2di (__a);
-+ return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqabsb_s8 (int8_t __a)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_s8 (int8_t __a, int8_t __b)
- {
-- return (int8_t) __builtin_aarch64_sqabsqi (__a);
-+ return __builtin_aarch64_sqshlqi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqabsh_s16 (int16_t __a)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_s16 (int16_t __a, int16_t __b)
- {
-- return (int16_t) __builtin_aarch64_sqabshi (__a);
-+ return __builtin_aarch64_sqshlhi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqabss_s32 (int32_t __a)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_s32 (int32_t __a, int32_t __b)
- {
-- return (int32_t) __builtin_aarch64_sqabssi (__a);
-+ return __builtin_aarch64_sqshlsi (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqabsd_s64 (int64_t __a)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_sqabsdi (__a);
-+ return __builtin_aarch64_sqshldi (__a, __b);
- }
-
--/* vqadd */
--
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqaddb_s8 (int8_t __a, int8_t __b)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_u8 (uint8_t __a, uint8_t __b)
- {
-- return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
-+ return __builtin_aarch64_uqshlqi_uus (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqaddh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_u16 (uint16_t __a, uint16_t __b)
- {
-- return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
-+ return __builtin_aarch64_uqshlhi_uus (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqadds_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_u32 (uint32_t __a, uint32_t __b)
- {
-- return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
-+ return __builtin_aarch64_uqshlsi_uus (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqaddd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_u64 (uint64_t __a, uint64_t __b)
- {
-- return __builtin_aarch64_sqadddi (__a, __b);
-+ return __builtin_aarch64_uqshldi_uus (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqaddb_u8 (uint8_t __a, uint8_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s8 (int8x8_t __a, const int __b)
- {
-- return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqaddh_u16 (uint16_t __a, uint16_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s16 (int16x4_t __a, const int __b)
- {
-- return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqadds_u32 (uint32_t __a, uint32_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s32 (int32x2_t __a, const int __b)
- {
-- return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqaddd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_s64 (int64x1_t __a, const int __b)
- {
-- return __builtin_aarch64_uqadddi_uuu (__a, __b);
-+ return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
- }
-
--/* vqdmlal */
--
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u8 (uint8x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u16 (uint16x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
-- int const __d)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u32 (uint32x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
-- int const __d)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshl_n_u64 (uint64x1_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d);
-+ return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s8 (int8x16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c);
-+ return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d);
-+ return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d);
-+ return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c);
-+ return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u8 (uint8x16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlalv2si (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u16 (uint16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
-- int const __d)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u32 (uint32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
-- int const __d)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlq_n_u64 (uint64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_n_s8 (int8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c);
-+ return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_n_s16 (int16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d);
-+ return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_n_s32 (int32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d);
-+ return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c);
-+ return __builtin_aarch64_sqshl_ndi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlb_n_u8 (uint8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlalhi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlh_n_u16 (uint16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshls_n_u32 (uint32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshld_n_u64 (uint64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlalsi (__a, __b, __c);
-+ return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
-+/* vqshlu */
-+
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s8 (int8x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s16 (int16x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
- }
-
--/* vqdmlsl */
--
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s32 (int32x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c);
-+ return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlu_n_s64 (int64x1_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c);
-+ return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c,
-- int const __d)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s8 (int8x16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c,
-- int const __d)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c);
-+ return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluq_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d);
-+ return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlub_n_s8 (int8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d);
-+ return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshluh_n_s16 (int16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c);
-+ return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlus_n_s32 (int32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlslv2si (__a, __b, __c);
-+ return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshlud_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c);
-+ return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c,
-- int const __d)
-+/* vqshrn */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d);
-+ return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c,
-- int const __d)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d);
-+ return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c);
-+ return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_u16 (uint16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_u32 (uint32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrn_n_u64 (uint64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c);
-+ return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnh_n_s16 (int16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlslhi (__a, __b, __c);
-+ return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrns_n_s32 (int32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d);
-+ return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnd_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d);
-+ return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnh_n_u16 (uint16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlslsi (__a, __b, __c);
-+ return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrns_n_u32 (uint32_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrnd_n_u64 (uint64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d);
-+ return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
- }
-
--/* vqdmulh */
-+/* vqshrun */
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrun_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c);
-+ return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrun_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c);
-+ return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrun_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c);
-+ return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrunh_n_s16 (int16_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c);
-+ return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqdmulhh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshruns_n_s32 (int32_t __a, const int __b)
- {
-- return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b);
-+ return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqshrund_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c);
-+ return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
-+/* vqsub */
-+
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubb_s8 (int8_t __a, int8_t __b)
- {
-- return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c);
-+ return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmulhs_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubh_s16 (int16_t __a, int16_t __b)
- {
-- return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b);
-+ return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubs_s32 (int32_t __a, int32_t __b)
- {
-- return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c);
-+ return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubd_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c);
-+ return __builtin_aarch64_sqsubdi (__a, __b);
- }
-
--/* vqdmull */
--
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubb_u8 (uint8_t __a, uint8_t __b)
- {
-- return __builtin_aarch64_sqdmullv4hi (__a, __b);
-+ return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubh_u16 (uint16_t __a, uint16_t __b)
- {
-- return __builtin_aarch64_sqdmull2v8hi (__a, __b);
-+ return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubs_u32 (uint32_t __a, uint32_t __b)
- {
-- return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c);
-+ return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqsubd_u64 (uint64_t __a, uint64_t __b)
- {
-- return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c);
-+ return __builtin_aarch64_uqsubdi_uuu (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_high_n_s16 (int16x8_t __a, int16_t __b)
-+/* vqtbl2 */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull2_nv8hi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-+ return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqdmull_n_s16 (int16x4_t __a, int16_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull_nv4hi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmullv2si (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull2v4si (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c)
-+/* vqtbl3 */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_high_n_s32 (int32x4_t __a, int32_t __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull2_nv4si (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqdmull_n_s32 (int32x2_t __a, int32_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull_nv2si (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmullh_s16 (int16_t __a, int16_t __b)
-+/* vqtbl4 */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx)
- {
-- return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx)
- {
-- return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmulls_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmullsi (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx)
- {
-- return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
- }
-
--/* vqmovn */
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqmovn_s16 (int16x8_t __a)
-+/* vqtbx2 */
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx)
- {
-- return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-+ return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqmovn_s32 (int32x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx)
- {
-- return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o,
-+ (int8x8_t)idx);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqmovn_s64 (int64x2_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx)
- {
-- return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o,
-+ (int8x8_t)idx);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqmovn_u16 (uint16x8_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx)
- {
-- return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-+ return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqmovn_u32 (uint32x4_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx)
- {
-- return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o,
-+ (int8x16_t)idx);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqmovn_u64 (uint64x2_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx)
- {
-- return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o,
-+ (int8x16_t)idx);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqmovnh_s16 (int16_t __a)
-+/* vqtbx3 */
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx)
- {
-- return (int8_t) __builtin_aarch64_sqmovnhi (__a);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
-+ return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqmovns_s32 (int32_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx)
- {
-- return (int16_t) __builtin_aarch64_sqmovnsi (__a);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o,
-+ (int8x8_t)idx);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqmovnd_s64 (int64_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx)
- {
-- return (int32_t) __builtin_aarch64_sqmovndi (__a);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o,
-+ (int8x8_t)idx);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqmovnh_u16 (uint16_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx)
- {
-- return (uint8_t) __builtin_aarch64_uqmovnhi (__a);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
-+ return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqmovns_u32 (uint32_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
- {
-- return (uint16_t) __builtin_aarch64_uqmovnsi (__a);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o,
-+ (int8x16_t)idx);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqmovnd_u64 (uint64_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
- {
-- return (uint32_t) __builtin_aarch64_uqmovndi (__a);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o,
-+ (int8x16_t)idx);
- }
-
--/* vqmovun */
-+/* vqtbx4 */
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqmovun_s16 (int16x8_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx)
- {
-- return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3);
-+ return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqmovun_s32 (int32x4_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
- {
-- return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o,
-+ (int8x8_t)idx);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqmovun_s64 (int64x2_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
- {
-- return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o,
-+ (int8x8_t)idx);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqmovunh_s16 (int16_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx)
- {
-- return (int8_t) __builtin_aarch64_sqmovunhi (__a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3);
-+ return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqmovuns_s32 (int32_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
- {
-- return (int16_t) __builtin_aarch64_sqmovunsi (__a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o,
-+ (int8x16_t)idx);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqmovund_s64 (int64_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
- {
-- return (int32_t) __builtin_aarch64_sqmovundi (__a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-+ return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o,
-+ (int8x16_t)idx);
- }
-
--/* vqneg */
-+/* vrbit */
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqnegq_s64 (int64x2_t __a)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrbit_p8 (poly8x8_t __a)
- {
-- return (int64x2_t) __builtin_aarch64_sqnegv2di (__a);
-+ return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqnegb_s8 (int8_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrbit_s8 (int8x8_t __a)
- {
-- return (int8_t) __builtin_aarch64_sqnegqi (__a);
-+ return __builtin_aarch64_rbitv8qi (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqnegh_s16 (int16_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrbit_u8 (uint8x8_t __a)
- {
-- return (int16_t) __builtin_aarch64_sqneghi (__a);
-+ return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqnegs_s32 (int32_t __a)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrbitq_p8 (poly8x16_t __a)
- {
-- return (int32_t) __builtin_aarch64_sqnegsi (__a);
-+ return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqnegd_s64 (int64_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrbitq_s8 (int8x16_t __a)
- {
-- return __builtin_aarch64_sqnegdi (__a);
-+ return __builtin_aarch64_rbitv16qi (__a);
- }
-
--/* vqrdmulh */
--
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrbitq_u8 (uint8x16_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c);
-+ return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+/* vrecpe */
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpe_u32 (uint32x2_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c);
-+ return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpeq_u32 (uint32x4_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c);
-+ return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpes_f32 (float32_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c);
-+ return __builtin_aarch64_frecpesf (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmulhh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecped_f64 (float64_t __a)
- {
-- return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b);
-+ return __builtin_aarch64_frecpedf (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpe_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c);
-+ return __builtin_aarch64_frecpev2sf (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpe_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c);
-+ return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) };
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmulhs_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpeq_f32 (float32x4_t __a)
- {
-- return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b);
-+ return __builtin_aarch64_frecpev4sf (__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpeq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c);
-+ return __builtin_aarch64_frecpev2df (__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c)
-+/* vrecps */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpss_f32 (float32_t __a, float32_t __b)
- {
-- return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c);
-+ return __builtin_aarch64_frecpssf (__a, __b);
- }
-
--/* vqrshl */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqrshl_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpsd_f64 (float64_t __a, float64_t __b)
- {
-- return __builtin_aarch64_sqrshlv8qi (__a, __b);
-+ return __builtin_aarch64_frecpsdf (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrshl_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecps_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return __builtin_aarch64_sqrshlv4hi (__a, __b);
-+ return __builtin_aarch64_frecpsv2sf (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrshl_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecps_f64 (float64x1_t __a, float64x1_t __b)
- {
-- return __builtin_aarch64_sqrshlv2si (__a, __b);
-+ return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0)) };
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vqrshl_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])};
-+ return __builtin_aarch64_frecpsv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpsq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b);
-+ return __builtin_aarch64_frecpsv2df (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
-+/* vrecpx */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpxs_f32 (float32_t __a)
- {
-- return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b);
-+ return __builtin_aarch64_frecpxsf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpxd_f64 (float64_t __a)
- {
-- return __builtin_aarch64_uqrshlv2si_uus ( __a, __b);
-+ return __builtin_aarch64_frecpxdf (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
-+
-+/* vrev */
-+
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev16_p8 (poly8x8_t a)
- {
-- return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])};
-+ return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev16_s8 (int8x8_t a)
- {
-- return __builtin_aarch64_sqrshlv16qi (__a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev16_u8 (uint8x8_t a)
- {
-- return __builtin_aarch64_sqrshlv8hi (__a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev16q_p8 (poly8x16_t a)
- {
-- return __builtin_aarch64_sqrshlv4si (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev16q_s8 (int8x16_t a)
- {
-- return __builtin_aarch64_sqrshlv2di (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev16q_u8 (uint8x16_t a)
- {
-- return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32_p8 (poly8x8_t a)
- {
-- return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32_p16 (poly16x4_t a)
- {
-- return __builtin_aarch64_uqrshlv4si_uus ( __a, __b);
-+ return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32_s8 (int8x8_t a)
- {
-- return __builtin_aarch64_uqrshlv2di_uus ( __a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqrshlb_s8 (int8_t __a, int8_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32_s16 (int16x4_t a)
- {
-- return __builtin_aarch64_sqrshlqi (__a, __b);
-+ return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrshlh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32_u8 (uint8x8_t a)
- {
-- return __builtin_aarch64_sqrshlhi (__a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrshls_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32_u16 (uint16x4_t a)
- {
-- return __builtin_aarch64_sqrshlsi (__a, __b);
-+ return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqrshld_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32q_p8 (poly8x16_t a)
- {
-- return __builtin_aarch64_sqrshldi (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqrshlb_u8 (uint8_t __a, uint8_t __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32q_p16 (poly16x8_t a)
- {
-- return __builtin_aarch64_uqrshlqi_uus (__a, __b);
-+ return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqrshlh_u16 (uint16_t __a, uint16_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32q_s8 (int8x16_t a)
- {
-- return __builtin_aarch64_uqrshlhi_uus (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqrshls_u32 (uint32_t __a, uint32_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32q_s16 (int16x8_t a)
- {
-- return __builtin_aarch64_uqrshlsi_uus (__a, __b);
-+ return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqrshld_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32q_u8 (uint8x16_t a)
- {
-- return __builtin_aarch64_uqrshldi_uus (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
- }
-
--/* vqrshrn */
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev32q_u16 (uint16x8_t a)
-+{
-+ return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqrshrn_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_f16 (float16x4_t __a)
- {
-- return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b);
-+ return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqrshrn_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_f32 (float32x2_t a)
- {
-- return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b);
-+ return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqrshrn_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_p8 (poly8x8_t a)
- {
-- return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqrshrn_n_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_p16 (poly16x4_t a)
- {
-- return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b);
-+ return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqrshrn_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_s8 (int8x8_t a)
- {
-- return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqrshrn_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_s16 (int16x4_t a)
- {
-- return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b);
-+ return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqrshrnh_n_s16 (int16_t __a, const int __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_s32 (int32x2_t a)
- {
-- return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b);
-+ return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrshrns_n_s32 (int32_t __a, const int __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_u8 (uint8x8_t a)
- {
-- return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
-+ return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrshrnd_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_u16 (uint16x4_t a)
- {
-- return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
-+ return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqrshrnh_n_u16 (uint16_t __a, const int __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_u32 (uint32x2_t a)
- {
-- return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
-+ return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqrshrns_n_u32 (uint32_t __a, const int __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_f16 (float16x8_t __a)
- {
-- return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
-+ return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqrshrnd_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_f32 (float32x4_t a)
- {
-- return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
-+ return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
- }
-
--/* vqrshrun */
--
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqrshrun_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_p8 (poly8x16_t a)
- {
-- return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqrshrun_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_p16 (poly16x8_t a)
- {
-- return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
-+ return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqrshrun_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_s8 (int8x16_t a)
- {
-- return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqrshrunh_n_s16 (int16_t __a, const int __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_s16 (int16x8_t a)
- {
-- return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
-+ return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqrshruns_n_s32 (int32_t __a, const int __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_s32 (int32x4_t a)
- {
-- return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
-+ return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqrshrund_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_u8 (uint8x16_t a)
- {
-- return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
-+ return __builtin_shuffle (a,
-+ (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
- }
-
--/* vqshl */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqshl_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_u16 (uint16x8_t a)
- {
-- return __builtin_aarch64_sqshlv8qi (__a, __b);
-+ return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqshl_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_u32 (uint32x4_t a)
- {
-- return __builtin_aarch64_sqshlv4hi (__a, __b);
-+ return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqshl_s32 (int32x2_t __a, int32x2_t __b)
-+/* vrnd */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnd_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_sqshlv2si (__a, __b);
-+ return __builtin_aarch64_btruncv2sf (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vqshl_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnd_f64 (float64x1_t __a)
- {
-- return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
-+ return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshl_u8 (uint8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
-+ return __builtin_aarch64_btruncv4sf (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshl_u16 (uint16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
-+ return __builtin_aarch64_btruncv2df (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshl_u32 (uint32x2_t __a, int32x2_t __b)
-+/* vrnda */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnda_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
-+ return __builtin_aarch64_roundv2sf (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqshl_u64 (uint64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnda_f64 (float64x1_t __a)
- {
-- return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
-+ return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqshlq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndaq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_sqshlv16qi (__a, __b);
-+ return __builtin_aarch64_roundv4sf (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqshlq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndaq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_sqshlv8hi (__a, __b);
-+ return __builtin_aarch64_roundv2df (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqshlq_s32 (int32x4_t __a, int32x4_t __b)
-+/* vrndi */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndi_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_sqshlv4si (__a, __b);
-+ return __builtin_aarch64_nearbyintv2sf (__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqshlq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndi_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_sqshlv2di (__a, __b);
-+ return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndiq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
-+ return __builtin_aarch64_nearbyintv4sf (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndiq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
-+ return __builtin_aarch64_nearbyintv2df (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
-+/* vrndm */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndm_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
-+ return __builtin_aarch64_floorv2sf (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndm_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
-+ return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshlb_s8 (int8_t __a, int8_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndmq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_sqshlqi (__a, __b);
-+ return __builtin_aarch64_floorv4sf (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshlh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndmq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_sqshlhi (__a, __b);
-+ return __builtin_aarch64_floorv2df (__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshls_s32 (int32_t __a, int32_t __b)
-+/* vrndn */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndn_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_sqshlsi (__a, __b);
-+ return __builtin_aarch64_frintnv2sf (__a);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqshld_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndn_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_sqshldi (__a, __b);
-+ return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])};
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqshlb_u8 (uint8_t __a, uint8_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndnq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_uqshlqi_uus (__a, __b);
-+ return __builtin_aarch64_frintnv4sf (__a);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqshlh_u16 (uint16_t __a, uint16_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndnq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_uqshlhi_uus (__a, __b);
-+ return __builtin_aarch64_frintnv2df (__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqshls_u32 (uint32_t __a, uint32_t __b)
-+/* vrndp */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndp_f32 (float32x2_t __a)
- {
-- return __builtin_aarch64_uqshlsi_uus (__a, __b);
-+ return __builtin_aarch64_ceilv2sf (__a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqshld_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndp_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_uqshldi_uus (__a, __b);
-+ return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqshl_n_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndpq_f32 (float32x4_t __a)
- {
-- return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
-+ return __builtin_aarch64_ceilv4sf (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqshl_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndpq_f64 (float64x2_t __a)
- {
-- return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
-+ return __builtin_aarch64_ceilv2df (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqshl_n_s32 (int32x2_t __a, const int __b)
-+/* vrndx */
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndx_f32 (float32x2_t __a)
- {
-- return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
-+ return __builtin_aarch64_rintv2sf (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vqshl_n_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndx_f64 (float64x1_t __a)
- {
-- return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
-+ return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshl_n_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndxq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
-+ return __builtin_aarch64_rintv4sf (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshl_n_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndxq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
-+ return __builtin_aarch64_rintv2df (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshl_n_u32 (uint32x2_t __a, const int __b)
-+/* vrshl */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqshl_n_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
-+ return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqshlq_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vqshlq_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_s64 (int64x1_t __a, int64x1_t __b)
- {
-- return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
-+ return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], __b[0])};
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vqshlq_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
-- return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
-+ return __builtin_aarch64_urshlv8qi_uus (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vqshlq_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
-- return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
-+ return __builtin_aarch64_urshlv4hi_uus (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqshlq_n_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
-- return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
-+ return __builtin_aarch64_urshlv2si_uus (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqshlq_n_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
-- return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])};
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqshlq_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
-+ return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqshlq_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
-+ return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshlb_n_s8 (int8_t __a, const int __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
-+ return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshlh_n_s16 (int16_t __a, const int __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
-+ return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshls_n_s32 (int32_t __a, const int __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
-- return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
-+ return __builtin_aarch64_urshlv16qi_uus (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqshld_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
-- return __builtin_aarch64_sqshl_ndi (__a, __b);
-+ return __builtin_aarch64_urshlv8hi_uus (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqshlb_n_u8 (uint8_t __a, const int __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
-- return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
-+ return __builtin_aarch64_urshlv4si_uus (__a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqshlh_n_u16 (uint16_t __a, const int __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
-- return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
-+ return __builtin_aarch64_urshlv2di_uus (__a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqshls_n_u32 (uint32_t __a, const int __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshld_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
-+ return __builtin_aarch64_srshldi (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqshld_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshld_u64 (uint64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
-+ return __builtin_aarch64_urshldi_uus (__a, __b);
- }
-
--/* vqshlu */
-+/* vrshr */
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshlu_n_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_s8 (int8x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshlu_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_s16 (int16x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshlu_n_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_s32 (int32x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vqshlu_n_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_s64 (int64x1_t __a, const int __b)
- {
-- return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
-+ return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)};
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqshluq_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_u8 (uint8x8_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
-+ return __builtin_aarch64_urshr_nv8qi_uus (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vqshluq_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_u16 (uint16x4_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
-+ return __builtin_aarch64_urshr_nv4hi_uus (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vqshluq_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_u32 (uint32x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
-+ return __builtin_aarch64_urshr_nv2si_uus (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vqshluq_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshr_n_u64 (uint64x1_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)};
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshlub_n_s8 (int8_t __a, const int __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_s8 (int8x16_t __a, const int __b)
- {
-- return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
-+ return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshluh_n_s16 (int16_t __a, const int __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_s16 (int16x8_t __a, const int __b)
- {
-- return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
-+ return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshlus_n_s32 (int32_t __a, const int __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_s32 (int32x4_t __a, const int __b)
- {
-- return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
-+ return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqshlud_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
-+ return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b);
- }
-
--/* vqshrn */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqshrn_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_u8 (uint8x16_t __a, const int __b)
- {
-- return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
-+ return __builtin_aarch64_urshr_nv16qi_uus (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vqshrn_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_u16 (uint16x8_t __a, const int __b)
- {
-- return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
-+ return __builtin_aarch64_urshr_nv8hi_uus (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vqshrn_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_u32 (uint32x4_t __a, const int __b)
- {
-- return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
-+ return __builtin_aarch64_urshr_nv4si_uus (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshrn_n_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrq_n_u64 (uint64x2_t __a, const int __b)
- {
-- return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b);
-+ return __builtin_aarch64_urshr_nv2di_uus (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshrn_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrd_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b);
-+ return __builtin_aarch64_srshr_ndi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshrn_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrshrd_n_u64 (uint64_t __a, const int __b)
- {
-- return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b);
-+ return __builtin_aarch64_urshr_ndi_uus (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshrnh_n_s16 (int16_t __a, const int __b)
-+/* vrsqrte. */
-+
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtes_f32 (float32_t __a)
- {
-- return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
-+ return __builtin_aarch64_rsqrtesf (__a);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshrns_n_s32 (int32_t __a, const int __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrted_f64 (float64_t __a)
- {
-- return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
-+ return __builtin_aarch64_rsqrtedf (__a);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshrnd_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrte_f32 (float32x2_t __a)
- {
-- return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
-+ return __builtin_aarch64_rsqrtev2sf (__a);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqshrnh_n_u16 (uint16_t __a, const int __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrte_f64 (float64x1_t __a)
- {
-- return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
-+ return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))};
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqshrns_n_u32 (uint32_t __a, const int __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrteq_f32 (float32x4_t __a)
- {
-- return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
-+ return __builtin_aarch64_rsqrtev4sf (__a);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqshrnd_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrteq_f64 (float64x2_t __a)
- {
-- return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
-+ return __builtin_aarch64_rsqrtev2df (__a);
- }
-
--/* vqshrun */
-+/* vrsqrts. */
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqshrun_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtss_f32 (float32_t __a, float32_t __b)
- {
-- return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
-+ return __builtin_aarch64_rsqrtssf (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vqshrun_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline float64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtsd_f64 (float64_t __a, float64_t __b)
- {
-- return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
-+ return __builtin_aarch64_rsqrtsdf (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vqshrun_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
-+ return __builtin_aarch64_rsqrtsv2sf (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqshrunh_n_s16 (int16_t __a, const int __b)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrts_f64 (float64x1_t __a, float64x1_t __b)
- {
-- return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
-+ return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0),
-+ vget_lane_f64 (__b, 0))};
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqshruns_n_s32 (int32_t __a, const int __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
- {
-- return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
-+ return __builtin_aarch64_rsqrtsv4sf (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqshrund_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b)
- {
-- return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
-+ return __builtin_aarch64_rsqrtsv2df (__a, __b);
- }
-
--/* vqsub */
-+/* vrsra */
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vqsubb_s8 (int8_t __a, int8_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
-- return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vqsubh_s16 (int16_t __a, int16_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
-- return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vqsubs_s32 (int32_t __a, int32_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
-- return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vqsubd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
-- return __builtin_aarch64_sqsubdi (__a, __b);
-+ return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vqsubb_u8 (uint8_t __a, uint8_t __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
-- return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
-+ return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vqsubh_u16 (uint16_t __a, uint16_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
-- return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
-+ return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vqsubs_u32 (uint32_t __a, uint32_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
-- return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
-+ return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vqsubd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
-- return __builtin_aarch64_uqsubdi_uuu (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)};
- }
-
--/* vqtbl2 */
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-+{
-+ return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c);
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-- return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
-+ return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c);
- }
-
--/* vqtbl3 */
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-+{
-+ return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c);
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_srsra_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
-+#pragma GCC push_options
-+#pragma GCC target ("+nothing+crypto")
-+
-+/* vsha1 */
-+
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk);
-+}
-+
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1h_u32 (uint32_t hash_e)
-+{
-+ return __builtin_aarch64_crypto_sha1hsi_uu (hash_e);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11);
- }
-
--/* vqtbl4 */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
-+ return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_p64 (poly64_t a, poly64_t b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx);
-+ return
-+ __builtin_aarch64_crypto_pmulldi_ppp (a, b);
- }
-
--
--/* vqtbx2 */
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmull_high_p64 (poly64x2_t a, poly64x2_t b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-- return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx);
-+ return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx)
-+#pragma GCC pop_options
-+
-+/* vshl */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s8 (int8x8_t __a, const int __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o,
-- (int8x8_t)idx);
-+ return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s16 (int16x4_t __a, const int __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o,
-- (int8x8_t)idx);
-+ return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s32 (int32x2_t __a, const int __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1);
-- return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx);
-+ return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_s64 (int64x1_t __a, const int __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o,
-- (int8x16_t)idx);
-+ return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)};
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u8 (uint8x8_t __a, const int __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o,
-- (int8x16_t)idx);
-+ return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b);
- }
-
--/* vqtbx3 */
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u16 (uint16x4_t __a, const int __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
-- return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx);
-+ return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u32 (uint32x2_t __a, const int __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o,
-- (int8x8_t)idx);
-+ return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_n_u64 (uint64x1_t __a, const int __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o,
-- (int8x8_t)idx);
-+ return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s8 (int8x16_t __a, const int __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2);
-- return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx);
-+ return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s16 (int16x8_t __a, const int __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o,
-- (int8x16_t)idx);
-+ return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s32 (int32x4_t __a, const int __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2);
-- return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o,
-- (int8x16_t)idx);
-+ return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b);
- }
-
--/* vqtbx4 */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_s64 (int64x2_t __a, const int __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3);
-- return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx);
-+ return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u8 (uint8x16_t __a, const int __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o,
-- (int8x8_t)idx);
-+ return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u16 (uint16x8_t __a, const int __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o,
-- (int8x8_t)idx);
-+ return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u32 (uint32x4_t __a, const int __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3);
-- return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx);
-+ return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_n_u64 (uint64x2_t __a, const int __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o,
-- (int8x16_t)idx);
-+ return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_n_s64 (int64_t __a, const int __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3);
-- return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o,
-- (int8x16_t)idx);
-+ return __builtin_aarch64_ashldi (__a, __b);
- }
-
--/* vrbit */
--
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vrbit_p8 (poly8x8_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_n_u64 (uint64_t __a, const int __b)
- {
-- return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
-+ return (uint64_t) __builtin_aarch64_ashldi (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrbit_s8 (int8x8_t __a)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return __builtin_aarch64_rbitv8qi (__a);
-+ return __builtin_aarch64_sshlv8qi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrbit_u8 (uint8x8_t __a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
-+ return __builtin_aarch64_sshlv4hi (__a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vrbitq_p8 (poly8x16_t __a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a);
-+ return __builtin_aarch64_sshlv2si (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrbitq_s8 (int8x16_t __a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_s64 (int64x1_t __a, int64x1_t __b)
- {
-- return __builtin_aarch64_rbitv16qi (__a);
-+ return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])};
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrbitq_u8 (uint8x16_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
-- return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a);
-+ return __builtin_aarch64_ushlv8qi_uus (__a, __b);
- }
-
--/* vrecpe */
--
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vrecpe_u32 (uint32x2_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
-- return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
-+ return __builtin_aarch64_ushlv4hi_uus (__a, __b);
- }
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vrecpeq_u32 (uint32x4_t __a)
-+
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
-- return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a);
-+ return __builtin_aarch64_ushlv2si_uus (__a, __b);
- }
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vrecpes_f32 (float32_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
-- return __builtin_aarch64_frecpesf (__a);
-+ return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])};
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vrecped_f64 (float64_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- return __builtin_aarch64_frecpedf (__a);
-+ return __builtin_aarch64_sshlv16qi (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrecpe_f32 (float32x2_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- return __builtin_aarch64_frecpev2sf (__a);
-+ return __builtin_aarch64_sshlv8hi (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrecpeq_f32 (float32x4_t __a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- return __builtin_aarch64_frecpev4sf (__a);
-+ return __builtin_aarch64_sshlv4si (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrecpeq_f64 (float64x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- return __builtin_aarch64_frecpev2df (__a);
-+ return __builtin_aarch64_sshlv2di (__a, __b);
- }
-
--/* vrecps */
--
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vrecpss_f32 (float32_t __a, float32_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
-- return __builtin_aarch64_frecpssf (__a, __b);
-+ return __builtin_aarch64_ushlv16qi_uus (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vrecpsd_f64 (float64_t __a, float64_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
-- return __builtin_aarch64_frecpsdf (__a, __b);
-+ return __builtin_aarch64_ushlv8hi_uus (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrecps_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
-- return __builtin_aarch64_frecpsv2sf (__a, __b);
-+ return __builtin_aarch64_ushlv4si_uus (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
-- return __builtin_aarch64_frecpsv4sf (__a, __b);
-+ return __builtin_aarch64_ushlv2di_uus (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrecpsq_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_frecpsv2df (__a, __b);
-+ return __builtin_aarch64_sshldi (__a, __b);
- }
-
--/* vrecpx */
--
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
--vrecpxs_f32 (float32_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshld_u64 (uint64_t __a, uint64_t __b)
- {
-- return __builtin_aarch64_frecpxsf (__a);
-+ return __builtin_aarch64_ushldi_uus (__a, __b);
- }
-
--__extension__ static __inline float64_t __attribute__ ((__always_inline__))
--vrecpxd_f64 (float64_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_s8 (int8x16_t __a, const int __b)
- {
-- return __builtin_aarch64_frecpxdf (__a);
-+ return __builtin_aarch64_sshll2_nv16qi (__a, __b);
- }
-
--
--/* vrev */
--
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vrev16_p8 (poly8x8_t a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return __builtin_aarch64_sshll2_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrev16_s8 (int8x8_t a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return __builtin_aarch64_sshll2_nv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrev16_u8 (uint8x8_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_u8 (uint8x16_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vrev16q_p8 (poly8x16_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_u16 (uint16x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-+ return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrev16q_s8 (int8x16_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_high_n_u32 (uint32x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-+ return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrev16q_u8 (uint8x16_t a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_s8 (int8x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-+ return __builtin_aarch64_sshll_nv8qi (__a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vrev32_p8 (poly8x8_t a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_s16 (int16x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_sshll_nv4hi (__a, __b);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vrev32_p16 (poly16x4_t a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_s32 (int32x2_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_sshll_nv2si (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrev32_s8 (int8x8_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_u8 (uint8x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vrev32_s16 (int16x4_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_u16 (uint16x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrev32_u8 (uint8x8_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshll_n_u32 (uint32x2_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vrev32_u16 (uint16x4_t a)
-+/* vshr */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s8 (int8x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
-+ return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vrev32q_p8 (poly8x16_t a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s16 (int16x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
-+ return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vrev32q_p16 (poly16x8_t a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s32 (int32x2_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrev32q_s8 (int8x16_t a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_s64 (int64x1_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
-+ return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vrev32q_s16 (int16x8_t a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u8 (uint8x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrev32q_u8 (uint8x16_t a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u16 (uint16x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
-+ return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vrev32q_u16 (uint16x8_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u32 (uint32x2_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
-+ return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrev64_f32 (float32x2_t a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshr_n_u64 (uint64x1_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
-+ return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vrev64_p8 (poly8x8_t a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s8 (int8x16_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
-+ return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vrev64_p16 (poly16x4_t a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s16 (int16x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+ return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrev64_s8 (int8x8_t a)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s32 (int32x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
-+ return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vrev64_s16 (int16x4_t a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_s64 (int64x2_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+ return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vrev64_s32 (int32x2_t a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u8 (uint8x16_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
-+ return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrev64_u8 (uint8x8_t a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u16 (uint16x8_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
-+ return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vrev64_u16 (uint16x4_t a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u32 (uint32x4_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
-+ return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vrev64_u32 (uint32x2_t a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrq_n_u64 (uint64x2_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
-+ return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrev64q_f32 (float32x4_t a)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrd_n_s64 (int64_t __a, const int __b)
- {
-- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_ashr_simddi (__a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vrev64q_p8 (poly8x16_t a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vshrd_n_u64 (uint64_t __a, const int __b)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
-+ return __builtin_aarch64_lshr_simddi_uus (__a, __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vrev64q_p16 (poly16x8_t a)
-+/* vsli */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
-- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrev64q_s8 (int8x16_t a)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
-+ return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vrev64q_s16 (int16x8_t a)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
-- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vrev64q_s32 (int32x4_t a)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
-- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
-+ return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrev64q_u8 (uint8x16_t a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
-- return __builtin_shuffle (a,
-- (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
-+ return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vrev64q_u16 (uint16x8_t a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
-- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
-+ return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vrev64q_u32 (uint32x4_t a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
-- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
-+ return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
- }
-
--/* vrnd */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrnd_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
-- return __builtin_aarch64_btruncv2sf (__a);
-+ return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrnd_f64 (float64x1_t __a)
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
- {
-- return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0);
-+ return (poly64x1_t) {__builtin_aarch64_ssli_ndi_ppps (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndq_f32 (float32x4_t __a)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
-- return __builtin_aarch64_btruncv4sf (__a);
-+ return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndq_f64 (float64x2_t __a)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
-- return __builtin_aarch64_btruncv2df (__a);
-+ return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
- }
-
--/* vrnda */
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+{
-+ return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrnda_f32 (float32x2_t __a)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
-- return __builtin_aarch64_roundv2sf (__a);
-+ return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrnda_f64 (float64x1_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
-- return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndaq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
-- return __builtin_aarch64_roundv4sf (__a);
-+ return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndaq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
-- return __builtin_aarch64_roundv2df (__a);
-+ return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c);
- }
-
--/* vrndi */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrndi_f32 (float32x2_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
-- return __builtin_aarch64_nearbyintv2sf (__a);
-+ return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrndi_f64 (float64x1_t __a)
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
- {
-- return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_ssli_nv2di_ppps (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndiq_f32 (float32x4_t __a)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
- {
-- return __builtin_aarch64_nearbyintv4sf (__a);
-+ return __builtin_aarch64_ssli_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndiq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
- {
-- return __builtin_aarch64_nearbyintv2df (__a);
-+ return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c);
- }
-
--/* vrndm */
-+/* vsqadd */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrndm_f32 (float32x2_t __a)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u8 (uint8x8_t __a, int8x8_t __b)
- {
-- return __builtin_aarch64_floorv2sf (__a);
-+ return __builtin_aarch64_usqaddv8qi_uus (__a, __b);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrndm_f64 (float64x1_t __a)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u16 (uint16x4_t __a, int16x4_t __b)
- {
-- return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_usqaddv4hi_uus (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndmq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u32 (uint32x2_t __a, int32x2_t __b)
- {
-- return __builtin_aarch64_floorv4sf (__a);
-+ return __builtin_aarch64_usqaddv2si_uus (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndmq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadd_u64 (uint64x1_t __a, int64x1_t __b)
- {
-- return __builtin_aarch64_floorv2df (__a);
-+ return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])};
- }
-
--/* vrndn */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrndn_f32 (float32x2_t __a)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u8 (uint8x16_t __a, int8x16_t __b)
- {
-- return __builtin_aarch64_frintnv2sf (__a);
-+ return __builtin_aarch64_usqaddv16qi_uus (__a, __b);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrndn_f64 (float64x1_t __a)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u16 (uint16x8_t __a, int16x8_t __b)
- {
-- return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])};
-+ return __builtin_aarch64_usqaddv8hi_uus (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndnq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u32 (uint32x4_t __a, int32x4_t __b)
- {
-- return __builtin_aarch64_frintnv4sf (__a);
-+ return __builtin_aarch64_usqaddv4si_uus (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndnq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddq_u64 (uint64x2_t __a, int64x2_t __b)
- {
-- return __builtin_aarch64_frintnv2df (__a);
-+ return __builtin_aarch64_usqaddv2di_uus (__a, __b);
- }
-
--/* vrndp */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrndp_f32 (float32x2_t __a)
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddb_u8 (uint8_t __a, int8_t __b)
- {
-- return __builtin_aarch64_ceilv2sf (__a);
-+ return __builtin_aarch64_usqaddqi_uus (__a, __b);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrndp_f64 (float64x1_t __a)
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddh_u16 (uint16_t __a, int16_t __b)
- {
-- return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_usqaddhi_uus (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndpq_f32 (float32x4_t __a)
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqadds_u32 (uint32_t __a, int32_t __b)
- {
-- return __builtin_aarch64_ceilv4sf (__a);
-+ return __builtin_aarch64_usqaddsi_uus (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndpq_f64 (float64x2_t __a)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqaddd_u64 (uint64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_ceilv2df (__a);
-+ return __builtin_aarch64_usqadddi_uus (__a, __b);
- }
-
--/* vrndx */
--
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vrndx_f32 (float32x2_t __a)
-+/* vsqrt */
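-+/* Per-lane floating-point square root (FSQRT).  */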
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrt_f32 (float32x2_t a)
- {
-- return __builtin_aarch64_rintv2sf (__a);
-+ return __builtin_aarch64_sqrtv2sf (a);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vrndx_f64 (float64x1_t __a)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrtq_f32 (float32x4_t a)
- {
-- return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0);
-+ return __builtin_aarch64_sqrtv4sf (a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vrndxq_f32 (float32x4_t __a)
-+__extension__ extern __inline float64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrt_f64 (float64x1_t a)
- {
-- return __builtin_aarch64_rintv4sf (__a);
-+ return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) };
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vrndxq_f64 (float64x2_t __a)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrtq_f64 (float64x2_t a)
- {
-- return __builtin_aarch64_rintv2df (__a);
-+ return __builtin_aarch64_sqrtv2df (a);
- }
-
--/* vrshl */
-+/* vsra */
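-+/* Shift right and accumulate (SSRA/USRA): each lane of __b is shifted
-+   right by the immediate __c and added to the corresponding lane of
-+   __a.  */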
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrshl_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
-- return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vrshl_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
-- return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vrshl_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
-- return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b);
-+ return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vrshl_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
-- return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], __b[0])};
-+ return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrshl_u8 (uint8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv8qi_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vrshl_u16 (uint16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv4hi_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vrshl_u32 (uint32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv2si_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vrshl_u64 (uint64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
-- return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])};
-+ return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrshlq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
-- return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b);
-+ return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vrshlq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
-- return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b);
-+ return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vrshlq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
-- return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b);
-+ return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vrshlq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
-- return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b);
-+ return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv16qi_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv8hi_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv4si_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
-- return __builtin_aarch64_urshlv2di_uus (__a, __b);
-+ return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vrshld_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
- {
-- return __builtin_aarch64_srshldi (__a, __b);
-+ return __builtin_aarch64_ssra_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vrshld_u64 (uint64_t __a, int64_t __b)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
- {
-- return __builtin_aarch64_urshldi_uus (__a, __b);
-+ return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c);
- }
-
--/* vrshr */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrshr_n_s8 (int8x8_t __a, const int __b)
--{
-- return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b);
--}
-+/* vsri */
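-+/* Shift right and insert (SRI): each lane of __b is shifted right by
-+   the immediate __c and inserted into the corresponding lane of __a,
-+   leaving the top __c bits of each lane of __a unchanged.  */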
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vrshr_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
-- return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b);
-+ return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vrshr_n_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
-- return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b);
-+ return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vrshr_n_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
-- return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)};
-+ return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrshr_n_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv8qi_uus (__a, __b);
-+ return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vrshr_n_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv4hi_uus (__a, __b);
-+ return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vrshr_n_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv2si_uus (__a, __b);
-+ return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vrshr_n_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
-- return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)};
-+ return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrshrq_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
-- return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b);
-+ return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)};
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vrshrq_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
-- return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b);
-+ return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vrshrq_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
-- return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b);
-+ return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vrshrq_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
-- return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b);
-+ return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrshrq_n_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv16qi_uus (__a, __b);
-+ return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vrshrq_n_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv8hi_uus (__a, __b);
-+ return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vrshrq_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv4si_uus (__a, __b);
-+ return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vrshrq_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_nv2di_uus (__a, __b);
-+ return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vrshrd_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
-- return __builtin_aarch64_srshr_ndi (__a, __b);
-+ return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vrshrd_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrid_n_s64 (int64_t __a, int64_t __b, const int __c)
- {
-- return __builtin_aarch64_urshr_ndi_uus (__a, __b);
-+ return __builtin_aarch64_ssri_ndi (__a, __b, __c);
- }
-
--/* vrsra */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
- {
-- return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c);
-+ return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
--{
-- return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c);
--}
-+/* vst1 */
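-+/* Store a single 64-bit vector to memory (ST1).  */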
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_f16 (float16_t *__a, float16x4_t __b)
- {
-- return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c);
-+ __builtin_aarch64_st1v4hf (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_f32 (float32_t *a, float32x2_t b)
- {
-- return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)};
-+ __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_f64 (float64_t *a, float64x1_t b)
- {
-- return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c);
-+ *a = b[0];
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_p8 (poly8_t *a, poly8x8_t b)
- {
-- return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x8_t) b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_p16 (poly16_t *a, poly16x4_t b)
- {
-- return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x4_t) b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_p64 (poly64_t *a, poly64x1_t b)
- {
-- return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)};
-+ *a = b[0];
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s8 (int8_t *a, int8x8_t b)
- {
-- return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c);
-+ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s16 (int16_t *a, int16x4_t b)
- {
-- return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c);
-+ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s32 (int32_t *a, int32x2_t b)
- {
-- return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c);
-+ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_s64 (int64_t *a, int64x1_t b)
- {
-- return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c);
-+ *a = b[0];
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u8 (uint8_t *a, uint8x8_t b)
- {
-- return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x8_t) b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u16 (uint16_t *a, uint16x4_t b)
- {
-- return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x4_t) b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u32 (uint32_t *a, uint32x2_t b)
- {
-- return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
-+ (int32x2_t) b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_u64 (uint64_t *a, uint64x1_t b)
- {
-- return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c);
-+ *a = b[0];
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
-+/* vst1q */
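-+/* Store a single 128-bit vector to memory (ST1, Q-register form).  */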
-+
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_f16 (float16_t *__a, float16x8_t __b)
- {
-- return __builtin_aarch64_srsra_ndi (__a, __b, __c);
-+ __builtin_aarch64_st1v8hf (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_f32 (float32_t *a, float32x4_t b)
- {
-- return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c);
-+ __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
- }
-
--#pragma GCC push_options
--#pragma GCC target ("+nothing+crypto")
--
--/* vsha1 */
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_f64 (float64_t *a, float64x2_t b)
- {
-- return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk);
-+ __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_p8 (poly8_t *a, poly8x16_t b)
- {
-- return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk);
-+ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x16_t) b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_p16 (poly16_t *a, poly16x8_t b)
- {
-- return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk);
-+ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x8_t) b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vsha1h_u32 (uint32_t hash_e)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_p64 (poly64_t *a, poly64x2_t b)
- {
-- return __builtin_aarch64_crypto_sha1hsi_uu (hash_e);
-+ __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a,
-+ (poly64x2_t) b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s8 (int8_t *a, int8x16_t b)
- {
-- return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11);
-+ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s16 (int16_t *a, int16x8_t b)
- {
-- return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15);
-+ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s32 (int32_t *a, int32x4_t b)
- {
-- return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk);
-+ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_s64 (int64_t *a, int64x2_t b)
- {
-- return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk);
-+ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u8 (uint8_t *a, uint8x16_t b)
- {
-- return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7);
-+ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
-+ (int8x16_t) b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u16 (uint16_t *a, uint16x8_t b)
- {
-- return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15);
-+ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
-+ (int16x8_t) b);
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
--vmull_p64 (poly64_t a, poly64_t b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u32 (uint32_t *a, uint32x4_t b)
- {
-- return
-- __builtin_aarch64_crypto_pmulldi_ppp (a, b);
-+ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
-+ (int32x4_t) b);
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
--vmull_high_p64 (poly64x2_t a, poly64x2_t b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_u64 (uint64_t *a, uint64x2_t b)
- {
-- return __builtin_aarch64_crypto_pmullv2di_ppp (a, b);
-+ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
-+ (int64x2_t) b);
- }
-
--#pragma GCC pop_options
-+/* vst1_lane */
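-+/* Store a single lane, selected by the constant __lane, from a
-+   64-bit vector to memory.  */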
-
--/* vshl */
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
-+{
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vshl_n_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
- {
-- return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vshl_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane)
- {
-- return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vshl_n_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane)
- {
-- return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vshl_n_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane)
- {
-- return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)};
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vshl_n_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_p64 (poly64_t *__a, poly64x1_t __b, const int __lane)
- {
-- return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vshl_n_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane)
- {
-- return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vshl_n_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane)
- {
-- return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vshl_n_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane)
- {
-- return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)};
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vshlq_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane)
- {
-- return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshlq_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane)
- {
-- return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshlq_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane)
- {
-- return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshlq_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane)
- {
-- return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vshlq_n_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
- {
-- return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshlq_n_u16 (uint16x8_t __a, const int __b)
-+/* vst1q_lane */
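-+/* Store a single lane, selected by the constant __lane, from a
-+   128-bit vector to memory.  */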
-+
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
- {
-- return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshlq_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
- {
-- return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshlq_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane)
- {
-- return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vshld_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane)
- {
-- return __builtin_aarch64_ashldi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vshld_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane)
- {
-- return (uint64_t) __builtin_aarch64_ashldi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vshl_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_p64 (poly64_t *__a, poly64x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv8qi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vshl_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv4hi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vshl_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv2si (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vshl_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane)
- {
-- return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])};
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vshl_u8 (uint8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv8qi_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vshl_u16 (uint16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv4hi_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vshl_u32 (uint32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane)
- {
-- return __builtin_aarch64_ushlv2si_uus (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vshl_u64 (uint64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane)
- {
-- return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])};
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vshlq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
- {
-- return __builtin_aarch64_sshlv16qi (__a, __b);
-+ *__a = __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshlq_s16 (int16x8_t __a, int16x8_t __b)
-+/* vstn */
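-+/* Interleaving structure stores (ST2): the vst2 variants store a pair
-+   of vectors to memory with their elements interleaved.  The 64-bit
-+   forms first widen each half to a Q register (zero-filling the upper
-+   half) to build the register tuple for the store.  */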
-+
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_s64 (int64_t * __a, int64x1x2_t val)
- {
-- return __builtin_aarch64_sshlv8hi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ int64x2x2_t temp;
-+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
-+ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshlq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_u64 (uint64_t * __a, uint64x1x2_t val)
- {
-- return __builtin_aarch64_sshlv4si (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ uint64x2x2_t temp;
-+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
-+ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshlq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_f64 (float64_t * __a, float64x1x2_t val)
- {
-- return __builtin_aarch64_sshlv2di (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ float64x2x2_t temp;
-+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
-+ __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vshlq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_s8 (int8_t * __a, int8x8x2_t val)
- {
-- return __builtin_aarch64_ushlv16qi_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ int8x16x2_t temp;
-+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshlq_u16 (uint16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_p8 (poly8_t * __a, poly8x8x2_t val)
- {
-- return __builtin_aarch64_ushlv8hi_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ poly8x16x2_t temp;
-+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshlq_u32 (uint32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_s16 (int16_t * __a, int16x4x2_t val)
- {
-- return __builtin_aarch64_ushlv4si_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ int16x8x2_t temp;
-+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshlq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_p16 (poly16_t * __a, poly16x4x2_t val)
- {
-- return __builtin_aarch64_ushlv2di_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ poly16x8x2_t temp;
-+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vshld_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_s32 (int32_t * __a, int32x2x2_t val)
- {
-- return __builtin_aarch64_sshldi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ int32x4x2_t temp;
-+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vshld_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_u8 (uint8_t * __a, uint8x8x2_t val)
- {
-- return __builtin_aarch64_ushldi_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ uint8x16x2_t temp;
-+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshll_high_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_u16 (uint16_t * __a, uint16x4x2_t val)
- {
-- return __builtin_aarch64_sshll2_nv16qi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ uint16x8x2_t temp;
-+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshll_high_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_u32 (uint32_t * __a, uint32x2x2_t val)
- {
-- return __builtin_aarch64_sshll2_nv8hi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ uint32x4x2_t temp;
-+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshll_high_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_f16 (float16_t * __a, float16x4x2_t val)
- {
-- return __builtin_aarch64_sshll2_nv4si (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ float16x8x2_t temp;
-+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
-+ __builtin_aarch64_st2v4hf (__a, __o);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshll_high_n_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_f32 (float32_t * __a, float32x2x2_t val)
- {
-- return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ float32x4x2_t temp;
-+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
-+ __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshll_high_n_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2_p64 (poly64_t * __a, poly64x1x2_t val)
- {
-- return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ poly64x2x2_t temp;
-+ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
-+ (poly64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
-+ (poly64x2_t) temp.val[1], 1);
-+ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshll_high_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_s8 (int8_t * __a, int8x16x2_t val)
- {
-- return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshll_n_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
- {
-- return __builtin_aarch64_sshll_nv8qi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshll_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_s16 (int16_t * __a, int16x8x2_t val)
- {
-- return __builtin_aarch64_sshll_nv4hi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshll_n_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
- {
-- return __builtin_aarch64_sshll_nv2si (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshll_n_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_s32 (int32_t * __a, int32x4x2_t val)
- {
-- return __builtin_aarch64_ushll_nv8qi_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-+ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshll_n_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_s64 (int64_t * __a, int64x2x2_t val)
- {
-- return __builtin_aarch64_ushll_nv4hi_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshll_n_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
- {
-- return __builtin_aarch64_ushll_nv2si_uus (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--/* vshr */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vshr_n_s8 (int8x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
- {
-- return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vshr_n_s16 (int16x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
- {
-- return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-+ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vshr_n_s32 (int32x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
- {
-- return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vshr_n_s64 (int64x1_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_f16 (float16_t * __a, float16x8x2_t val)
- {
-- return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)};
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
-+ __builtin_aarch64_st2v8hf (__a, __o);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vshr_n_u8 (uint8x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_f32 (float32_t * __a, float32x4x2_t val)
- {
-- return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
-+ __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vshr_n_u16 (uint16x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_f64 (float64_t * __a, float64x2x2_t val)
- {
-- return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
-+ __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vshr_n_u32 (uint32x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst2q_p64 (poly64_t * __a, poly64x2x2_t val)
- {
-- return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
-+ __builtin_aarch64_simd_oi __o;
-+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
-+ (poly64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
-+ (poly64x2_t) val.val[1], 1);
-+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vshr_n_u64 (uint64x1_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_s64 (int64_t * __a, int64x1x3_t val)
- {
-- return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)};
-+ __builtin_aarch64_simd_ci __o;
-+ int64x2x3_t temp;
-+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
-+ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vshrq_n_s8 (int8x16_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_u64 (uint64_t * __a, uint64x1x3_t val)
- {
-- return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ uint64x2x3_t temp;
-+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
-+ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vshrq_n_s16 (int16x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_f64 (float64_t * __a, float64x1x3_t val)
- {
-- return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ float64x2x3_t temp;
-+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
-+ __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vshrq_n_s32 (int32x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_s8 (int8_t * __a, int8x8x3_t val)
- {
-- return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ int8x16x3_t temp;
-+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vshrq_n_s64 (int64x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_p8 (poly8_t * __a, poly8x8x3_t val)
- {
-- return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ poly8x16x3_t temp;
-+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vshrq_n_u8 (uint8x16_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_s16 (int16_t * __a, int16x4x3_t val)
- {
-- return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ int16x8x3_t temp;
-+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vshrq_n_u16 (uint16x8_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_p16 (poly16_t * __a, poly16x4x3_t val)
- {
-- return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ poly16x8x3_t temp;
-+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vshrq_n_u32 (uint32x4_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_s32 (int32_t * __a, int32x2x3_t val)
- {
-- return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ int32x4x3_t temp;
-+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vshrq_n_u64 (uint64x2_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_u8 (uint8_t * __a, uint8x8x3_t val)
- {
-- return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ uint8x16x3_t temp;
-+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vshrd_n_s64 (int64_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_u16 (uint16_t * __a, uint16x4x3_t val)
- {
-- return __builtin_aarch64_ashr_simddi (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ uint16x8x3_t temp;
-+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vshrd_n_u64 (uint64_t __a, const int __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_u32 (uint32_t * __a, uint32x2x3_t val)
- {
-- return __builtin_aarch64_lshr_simddi_uus (__a, __b);
-+ __builtin_aarch64_simd_ci __o;
-+ uint32x4x3_t temp;
-+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--/* vsli */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_f16 (float16_t * __a, float16x4x3_t val)
- {
-- return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ float16x8x3_t temp;
-+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_f32 (float32_t * __a, float32x2x3_t val)
- {
-- return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ float32x4x3_t temp;
-+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
-+ __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3_p64 (poly64_t * __a, poly64x1x3_t val)
- {
-- return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ poly64x2x3_t temp;
-+ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
-+ (poly64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
-+ (poly64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
-+ (poly64x2_t) temp.val[2], 2);
-+ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_s8 (int8_t * __a, int8x16x3_t val)
- {
-- return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)};
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
- {
-- return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_s16 (int16_t * __a, int16x8x3_t val)
- {
-- return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
- {
-- return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_s32 (int32_t * __a, int32x4x3_t val)
- {
-- return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-+ __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_s64 (int64_t * __a, int64x2x3_t val)
- {
-- return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
- {
-- return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
- {
-- return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
- {
-- return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-+ __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
- {
-- return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_f16 (float16_t * __a, float16x8x3_t val)
- {
-- return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
-+ __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_f32 (float32_t * __a, float32x4x3_t val)
- {
-- return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
-+ __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_f64 (float64_t * __a, float64x2x3_t val)
- {
-- return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
-+ __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst3q_p64 (poly64_t * __a, poly64x2x3_t val)
- {
-- return __builtin_aarch64_ssli_ndi (__a, __b, __c);
-+ __builtin_aarch64_simd_ci __o;
-+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
-+ (poly64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
-+ (poly64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
-+ (poly64x2_t) val.val[2], 2);
-+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_s64 (int64_t * __a, int64x1x4_t val)
- {
-- return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ int64x2x4_t temp;
-+ temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
-+ __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--/* vsqadd */
--
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsqadd_u8 (uint8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_u64 (uint64_t * __a, uint64x1x4_t val)
- {
-- return __builtin_aarch64_usqaddv8qi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ uint64x2x4_t temp;
-+ temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
-+ __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsqadd_u16 (uint16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_f64 (float64_t * __a, float64x1x4_t val)
- {
-- return __builtin_aarch64_usqaddv4hi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ float64x2x4_t temp;
-+ temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3);
-+ __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsqadd_u32 (uint32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_s8 (int8_t * __a, int8x8x4_t val)
- {
-- return __builtin_aarch64_usqaddv2si_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ int8x16x4_t temp;
-+ temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsqadd_u64 (uint64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_p8 (poly8_t * __a, poly8x8x4_t val)
- {
-- return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])};
-+ __builtin_aarch64_simd_xi __o;
-+ poly8x16x4_t temp;
-+ temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsqaddq_u8 (uint8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_s16 (int16_t * __a, int16x4x4_t val)
- {
-- return __builtin_aarch64_usqaddv16qi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ int16x8x4_t temp;
-+ temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsqaddq_u16 (uint16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_p16 (poly16_t * __a, poly16x4x4_t val)
- {
-- return __builtin_aarch64_usqaddv8hi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ poly16x8x4_t temp;
-+ temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsqaddq_u32 (uint32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_s32 (int32_t * __a, int32x2x4_t val)
- {
-- return __builtin_aarch64_usqaddv4si_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ int32x4x4_t temp;
-+ temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vsqaddq_u64 (uint64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_u8 (uint8_t * __a, uint8x8x4_t val)
- {
-- return __builtin_aarch64_usqaddv2di_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ uint8x16x4_t temp;
-+ temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
--vsqaddb_u8 (uint8_t __a, int8_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_u16 (uint16_t * __a, uint16x4x4_t val)
- {
-- return __builtin_aarch64_usqaddqi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ uint16x8x4_t temp;
-+ temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
--vsqaddh_u16 (uint16_t __a, int16_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_u32 (uint32_t * __a, uint32x2x4_t val)
- {
-- return __builtin_aarch64_usqaddhi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ uint32x4x4_t temp;
-+ temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vsqadds_u32 (uint32_t __a, int32_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_f16 (float16_t * __a, float16x4x4_t val)
- {
-- return __builtin_aarch64_usqaddsi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ float16x8x4_t temp;
-+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vsqaddd_u64 (uint64_t __a, int64_t __b)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_f32 (float32_t * __a, float32x2x4_t val)
- {
-- return __builtin_aarch64_usqadddi_uus (__a, __b);
-+ __builtin_aarch64_simd_xi __o;
-+ float32x4x4_t temp;
-+ temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3);
-+ __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
- }
-
--/* vsqrt */
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vsqrt_f32 (float32x2_t a)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4_p64 (poly64_t * __a, poly64x1x4_t val)
- {
-- return __builtin_aarch64_sqrtv2sf (a);
-+ __builtin_aarch64_simd_xi __o;
-+ poly64x2x4_t temp;
-+ temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0)));
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) temp.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) temp.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) temp.val[3], 3);
-+ __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vsqrtq_f32 (float32x4_t a)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_s8 (int8_t * __a, int8x16x4_t val)
- {
-- return __builtin_aarch64_sqrtv4sf (a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
--vsqrt_f64 (float64x1_t a)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
- {
-- return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) };
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vsqrtq_f64 (float64x2_t a)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_s16 (int16_t * __a, int16x8x4_t val)
- {
-- return __builtin_aarch64_sqrtv2df (a);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--/* vsra */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
- {
-- return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_s32 (int32_t * __a, int32x4x4_t val)
- {
-- return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-+ __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_s64 (int64_t * __a, int64x2x4_t val)
- {
-- return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
- {
-- return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)};
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
- {
-- return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
- {
-- return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-+ __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
- {
-- return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_f16 (float16_t * __a, float16x8x4_t val)
- {
-- return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)};
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3);
-+ __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_f32 (float32_t * __a, float32x4x4_t val)
- {
-- return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
-+ __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_f64 (float64_t * __a, float64x2x4_t val)
- {
-- return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3);
-+ __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vst4q_p64 (poly64_t * __a, poly64x2x4_t val)
- {
-- return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c);
-+ __builtin_aarch64_simd_xi __o;
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) val.val[0], 0);
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) val.val[1], 1);
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) val.val[2], 2);
-+ __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
-+ (poly64x2_t) val.val[3], 3);
-+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
--{
-- return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c);
--}
-+/* vsub */
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsubd_s64 (int64_t __a, int64_t __b)
- {
-- return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c);
-+ return __a - __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsubd_u64 (uint64_t __a, uint64_t __b)
- {
-- return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c);
-+ return __a - __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
--{
-- return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c);
--}
-+/* vtbx1 */
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx)
- {
-- return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c);
-+ uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
-+ vmov_n_u8 (8));
-+ int8x8_t __tbl = vtbl1_s8 (__tab, __idx);
-+
-+ return vbsl_s8 (__mask, __tbl, __r);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vsrad_n_s64 (int64_t __a, int64_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx)
- {
-- return __builtin_aarch64_ssra_ndi (__a, __b, __c);
-+ uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
-+ uint8x8_t __tbl = vtbl1_u8 (__tab, __idx);
-+
-+ return vbsl_u8 (__mask, __tbl, __r);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx)
- {
-- return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c);
-+ uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
-+ poly8x8_t __tbl = vtbl1_p8 (__tab, __idx);
-+
-+ return vbsl_p8 (__mask, __tbl, __r);
- }
-
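
[Editorial note, not part of the patch: the AArch64 TBL/TBX instructions index
16-byte tables, but vtbx1 only has an 8-byte table, so the header emulates the
"out-of-range index keeps the destination lane" rule by hand: vclt_u8 builds an
all-ones mask for indices below 8 and vbsl selects between the TBL result and
__r. An illustrative use with hypothetical values:

  #include <arm_neon.h>
  #include <stdio.h>

  int
  main (void)
  {
    int8_t tab_b[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int8_t idx_b[8] = {0, 7, 8, 100, 3, 2, 1, 0};
    int8_t def_b[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int8x8_t res = vtbx1_s8 (vld1_s8 (def_b), vld1_s8 (tab_b),
                             vld1_s8 (idx_b));

    int8_t out[8];
    vst1_s8 (out, res);
    /* Indices 0..7 select from the table; lanes 2 and 3 (indices 8
       and 100) keep the -1 fallback: 10 17 -1 -1 13 12 11 10.  */
    for (int i = 0; i < 8; i++)
      printf ("%d ", out[i]);
    return 0;
  }
]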
--/* vsri */
-+/* vtbx3 */
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx)
- {
-- return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c);
-+ uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
-+ vmov_n_u8 (24));
-+ int8x8_t __tbl = vtbl3_s8 (__tab, __idx);
-+
-+ return vbsl_s8 (__mask, __tbl, __r);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx)
- {
-- return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c);
-+ uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
-+ uint8x8_t __tbl = vtbl3_u8 (__tab, __idx);
-+
-+ return vbsl_u8 (__mask, __tbl, __r);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
- {
-- return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c);
-+ uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
-+ poly8x8_t __tbl = vtbl3_p8 (__tab, __idx);
-+
-+ return vbsl_p8 (__mask, __tbl, __r);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-+/* vtbx4 */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
- {
-- return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)};
-+ int8x8_t result;
-+ int8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
-+ temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx);
-+ return result;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
- {
-- return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c);
-+ uint8x8_t result;
-+ uint8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
-+ temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
-+ (int8x8_t)__idx);
-+ return result;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
- {
-- return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c);
-+ poly8x8_t result;
-+ poly8x16x2_t temp;
-+ __builtin_aarch64_simd_oi __o;
-+ temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
-+ temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[0], 0);
-+ __o = __builtin_aarch64_set_qregoiv16qi (__o,
-+ (int8x16_t) temp.val[1], 1);
-+ result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
-+ (int8x8_t)__idx);
-+ return result;
- }
-
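
[Editorial note, not part of the patch: unlike vtbx1/vtbx3, the vtbx4 forms
need no compare-and-select fallback. The four 8-byte table halves are combined
into two Q registers (an OI tuple) so the hardware TBX, which indexes
Q-register tables, handles indices 0..31 and the keep-destination rule
directly. Illustrative sketch:

  #include <arm_neon.h>

  /* Indices 0..31 address the 32-byte table formed by tab.val[0..3];
     any larger index leaves the matching lane of fallback unchanged.  */
  uint8x8_t
  lookup32 (uint8x8_t fallback, uint8x8x4_t tab, uint8x8_t idx)
  {
    return vtbx4_u8 (fallback, tab, idx);
  }
]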
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-+/* vtrn */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)};
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_p8 (poly8x8_t __a, poly8x8_t __b)
- {
-- return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_p16 (poly16x4_t __a, poly16x4_t __b)
- {
-- return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_s8 (int8x8_t __a, int8x8_t __b)
- {
-- return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_s16 (int16x4_t __a, int16x4_t __b)
- {
-- return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_s32 (int32x2_t __a, int32x2_t __b)
- {
-- return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vsrid_n_s64 (int64_t __a, int64_t __b, const int __c)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_f16 (float16x8_t __a, float16x8_t __b)
- {
-- return __builtin_aarch64_ssri_ndi (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_f32 (float32x4_t __a, float32x4_t __b)
- {
-- return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--/* vst1 */
--
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_f16 (float16_t *__a, float16x4_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_f64 (float64x2_t __a, float64x2_t __b)
- {
-- __builtin_aarch64_st1v4hf (__a, __b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_f32 (float32_t *a, float32x2_t b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b)
- {
-- __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_f64 (float64_t *a, float64x1_t b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b)
- {
-- *a = b[0];
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_p8 (poly8_t *a, poly8x8_t b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_s8 (int8x16_t __a, int8x16_t __b)
- {
-- __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
-- (int8x8_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_p16 (poly16_t *a, poly16x4_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_s16 (int16x8_t __a, int16x8_t __b)
- {
-- __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
-- (int16x4_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_s8 (int8_t *a, int8x8_t b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_s32 (int32x4_t __a, int32x4_t __b)
- {
-- __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_s16 (int16_t *a, int16x4_t b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_s64 (int64x2_t __a, int64x2_t __b)
- {
-- __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_s32 (int32_t *a, int32x2_t b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_s64 (int64_t *a, int64x1_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- *a = b[0];
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_u8 (uint8_t *a, uint8x8_t b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a,
-- (int8x8_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_u16 (uint16_t *a, uint16x4_t b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
- {
-- __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a,
-- (int16x4_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_u32 (uint32_t *a, uint32x2_t b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_f16 (float16x4_t __a, float16x4_t __b)
- {
-- __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a,
-- (int32x2_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_u64 (uint64_t *a, uint64x1_t b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_f32 (float32x2_t __a, float32x2_t __b)
- {
-- *a = b[0];
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+#endif
- }
-
--/* vst1q */
--
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_f16 (float16_t *__a, float16x8_t __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_p8 (poly8x8_t __a, poly8x8_t __b)
- {
-- __builtin_aarch64_st1v8hf (__a, __b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_f32 (float32_t *a, float32x4_t b)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_p16 (poly16x4_t __a, poly16x4_t __b)
- {
-- __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_f64 (float64_t *a, float64x2_t b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_s8 (int8x8_t __a, int8x8_t __b)
- {
-- __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_p8 (poly8_t *a, poly8x16_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_s16 (int16x4_t __a, int16x4_t __b)
- {
-- __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
-- (int8x16_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_p16 (poly16_t *a, poly16x8_t b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_s32 (int32x2_t __a, int32x2_t __b)
- {
-- __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
-- (int16x8_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_s8 (int8_t *a, int8x16_t b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_s16 (int16_t *a, int16x8_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_s32 (int32_t *a, int32x4_t b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_s64 (int64_t *a, int64x2_t b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_f16 (float16x8_t __a, float16x8_t __b)
- {
-- __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_u8 (uint8_t *a, uint8x16_t b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_f32 (float32x4_t __a, float32x4_t __b)
- {
-- __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
-- (int8x16_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_u16 (uint16_t *a, uint16x8_t b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_f64 (float64x2_t __a, float64x2_t __b)
- {
-- __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a,
-- (int16x8_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_u32 (uint32_t *a, uint32x4_t b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b)
- {
-- __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a,
-- (int32x4_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_u64 (uint64_t *a, uint64x2_t b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b)
- {
-- __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a,
-- (int64x2_t) b);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--/* vst1_lane */
--
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_s8 (int8x16_t __a, int8x16_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_s16 (int16x8_t __a, int16x8_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_s32 (int32x4_t __a, int32x4_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_s64 (int64x2_t __a, int64x2_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-+#endif
- }
-
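
[Editorial note, not part of the patch: each vtrn1/vtrn2 body above carries two
shuffle masks because __builtin_shuffle numbers lanes in memory order, which on
big-endian AArch64 is reversed relative to the architectural register lanes;
both masks describe the same TRN1/TRN2 permutation. Concretely, with
little-endian lane values (illustrative only):

  #include <arm_neon.h>
  #include <stdio.h>

  int
  main (void)
  {
    uint32_t a_b[4] = {0, 1, 2, 3};
    uint32_t b_b[4] = {4, 5, 6, 7};
    uint32x4_t a = vld1q_u32 (a_b);
    uint32x4_t b = vld1q_u32 (b_b);
    uint32_t out[4];

    vst1q_u32 (out, vtrn1q_u32 (a, b));   /* even lanes: 0 4 2 6 */
    printf ("%u %u %u %u\n", out[0], out[1], out[2], out[3]);

    vst1q_u32 (out, vtrn2q_u32 (a, b));   /* odd lanes:  1 5 3 7 */
    printf ("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
    return 0;
  }
]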
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane)
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_f16 (float16x4_t __a, float16x4_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane)
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_f32 (float32x2_t a, float32x2_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane)
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_p8 (poly8x8_t a, poly8x8_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane)
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_p16 (poly16x4_t a, poly16x4_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane)
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_s8 (int8x8_t a, int8x8_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)};
- }
-
--/* vst1q_lane */
--
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane)
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_s16 (int16x4_t a, int16x4_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane)
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_s32 (int32x2_t a, int32x2_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane)
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_u8 (uint8x8_t a, uint8x8_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane)
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_u16 (uint16x4_t a, uint16x4_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane)
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_u32 (uint32x2_t a, uint32x2_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane)
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane)
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_f32 (float32x4_t a, float32x4_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane)
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_p8 (poly8x16_t a, poly8x16_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane)
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_p16 (poly16x8_t a, poly16x8_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane)
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_s8 (int8x16_t a, int8x16_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane)
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_s16 (int16x8_t a, int16x8_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane)
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_s32 (int32x4_t a, int32x4_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_u8 (uint8x16_t a, uint8x16_t b)
- {
-- *__a = __aarch64_vget_lane_any (__b, __lane);
-+ return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)};
- }
-
--/* vstn */
--
--__extension__ static __inline void
--vst2_s64 (int64_t * __a, int64x1x2_t val)
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_u16 (uint16x8_t a, uint16x8_t b)
- {
-- __builtin_aarch64_simd_oi __o;
-- int64x2x2_t temp;
-- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
-- __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
-+ return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)};
- }
-
--__extension__ static __inline void
--vst2_u64 (uint64_t * __a, uint64x1x2_t val)
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_u32 (uint32x4_t a, uint32x4_t b)
- {
-- __builtin_aarch64_simd_oi __o;
-- uint64x2x2_t temp;
-- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
-- __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
-+ return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)};
- }
-
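
[Editorial note, not part of the patch: the legacy two-result vtrn/vtrnq forms
above are thin wrappers pairing the TRN1 and TRN2 halves in an x2 struct.
Sketch:

  #include <arm_neon.h>

  /* .val[0] receives the even-lane half ({a0, b0}), .val[1] the
     odd-lane half ({a1, b1}) -- equivalent to vtrn1/vtrn2.  */
  uint32x2x2_t
  transpose_2x2 (uint32x2_t a, uint32x2_t b)
  {
    return vtrn_u32 (a, b);
  }
]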
--__extension__ static __inline void
--vst2_f64 (float64_t * __a, float64x1x2_t val)
--{
-- __builtin_aarch64_simd_oi __o;
-- float64x2x2_t temp;
-- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
-- __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o);
--}
-+/* vtst */
-
--__extension__ static __inline void
--vst2_s8 (int8_t * __a, int8x8x2_t val)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_s8 (int8x8_t __a, int8x8_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- int8x16x2_t temp;
-- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return (uint8x8_t) ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_p8 (poly8_t * __a, poly8x8x2_t val)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_s16 (int16x4_t __a, int16x4_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- poly8x16x2_t temp;
-- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return (uint16x4_t) ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_s16 (int16_t * __a, int16x4x2_t val)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_s32 (int32x2_t __a, int32x2_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- int16x8x2_t temp;
-- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return (uint32x2_t) ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_p16 (poly16_t * __a, poly16x4x2_t val)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_s64 (int64x1_t __a, int64x1_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- poly16x8x2_t temp;
-- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0));
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_s32 (int32_t * __a, int32x2x2_t val)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- int32x4x2_t temp;
-- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
-- __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
-+ return ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_u8 (uint8_t * __a, uint8x8x2_t val)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- uint8x16x2_t temp;
-- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_u16 (uint16_t * __a, uint16x4x2_t val)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- uint16x8x2_t temp;
-- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_u32 (uint32_t * __a, uint32x2x2_t val)
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_u64 (uint64x1_t __a, uint64x1_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- uint32x4x2_t temp;
-- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
-- __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
-+ return ((__a & __b) != __AARCH64_UINT64_C (0));
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_f16 (float16_t * __a, float16x4x2_t val)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_s8 (int8x16_t __a, int8x16_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- float16x8x2_t temp;
-- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
-- __builtin_aarch64_st2v4hf (__a, __o);
-+ return (uint8x16_t) ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2_f32 (float32_t * __a, float32x2x2_t val)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_s16 (int16x8_t __a, int16x8_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- float32x4x2_t temp;
-- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
-- __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
-+ return (uint16x8_t) ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_s8 (int8_t * __a, int8x16x2_t val)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_s32 (int32x4_t __a, int32x4_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return (uint32x4_t) ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_s64 (int64x2_t __a, int64x2_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return (uint64x2_t) ((__a & __b) != __AARCH64_INT64_C (0));
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_s16 (int16_t * __a, int16x8x2_t val)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_s32 (int32_t * __a, int32x4x2_t val)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-- __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
-+ return ((__a & __b) != 0);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_s64 (int64_t * __a, int64x2x2_t val)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
-+ return ((__a & __b) != __AARCH64_UINT64_C (0));
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstd_s64 (int64_t __a, int64_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
-- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return (__a & __b) ? -1ll : 0ll;
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstd_u64 (uint64_t __a, uint64_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
-- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return (__a & __b) ? -1ll : 0ll;
- }
-
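
[Editorial note, not part of the patch: the vtst family lowers to a plain
vector expression, AND then compare against zero, which the compiler can fold
to CMTST; each result lane is all-ones when the two inputs share any set bit.
Illustrative example:

  #include <arm_neon.h>
  #include <stdio.h>

  int
  main (void)
  {
    uint8_t a_b[8] = {0x01, 0x02, 0x0f, 0x00, 0x80, 0x10, 0xff, 0x00};
    uint8_t m_b[8] = {0x01, 0x01, 0xf0, 0xff, 0x80, 0x01, 0x0f, 0x00};
    uint8x8_t r = vtst_u8 (vld1_u8 (a_b), vld1_u8 (m_b));

    uint8_t out[8];
    vst1_u8 (out, r);
    /* Lanes where a & m is nonzero become 0xff:
       ff 00 00 00 ff 00 ff 00.  */
    for (int i = 0; i < 8; i++)
      printf ("%02x ", out[i]);
    return 0;
  }
]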
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
-+/* vuqadd */
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqadd_s8 (int8x8_t __a, uint8x8_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
-- __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
-+ return __builtin_aarch64_suqaddv8qi_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqadd_s16 (int16x4_t __a, uint16x4_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
-- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
-+ return __builtin_aarch64_suqaddv4hi_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_f16 (float16_t * __a, float16x8x2_t val)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqadd_s32 (int32x2_t __a, uint32x2_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
-- __builtin_aarch64_st2v8hf (__a, __o);
-+ return __builtin_aarch64_suqaddv2si_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_f32 (float32_t * __a, float32x4x2_t val)
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqadd_s64 (int64x1_t __a, uint64x1_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
-- __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
-+ return (int64x1_t) {__builtin_aarch64_suqadddi_ssu (__a[0], __b[0])};
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst2q_f64 (float64_t * __a, float64x2x2_t val)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddq_s8 (int8x16_t __a, uint8x16_t __b)
- {
-- __builtin_aarch64_simd_oi __o;
-- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
-- __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
-+ return __builtin_aarch64_suqaddv16qi_ssu (__a, __b);
- }
-
--__extension__ static __inline void
--vst3_s64 (int64_t * __a, int64x1x3_t val)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddq_s16 (int16x8_t __a, uint16x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- int64x2x3_t temp;
-- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
-- __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
-+ return __builtin_aarch64_suqaddv8hi_ssu (__a, __b);
- }
-
--__extension__ static __inline void
--vst3_u64 (uint64_t * __a, uint64x1x3_t val)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddq_s32 (int32x4_t __a, uint32x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- uint64x2x3_t temp;
-- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
-- __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
-+ return __builtin_aarch64_suqaddv4si_ssu (__a, __b);
- }
-
--__extension__ static __inline void
--vst3_f64 (float64_t * __a, float64x1x3_t val)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddq_s64 (int64x2_t __a, uint64x2_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- float64x2x3_t temp;
-- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
-- __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o);
-+ return __builtin_aarch64_suqaddv2di_ssu (__a, __b);
- }
-
--__extension__ static __inline void
--vst3_s8 (int8_t * __a, int8x8x3_t val)
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddb_s8 (int8_t __a, uint8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- int8x16x3_t temp;
-- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
-- __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return __builtin_aarch64_suqaddqi_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_p8 (poly8_t * __a, poly8x8x3_t val)
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddh_s16 (int16_t __a, uint16_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- poly8x16x3_t temp;
-- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
-- __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+ return __builtin_aarch64_suqaddhi_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_s16 (int16_t * __a, int16x4x3_t val)
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqadds_s32 (int32_t __a, uint32_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- int16x8x3_t temp;
-- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
-- __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return __builtin_aarch64_suqaddsi_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_p16 (poly16_t * __a, poly16x4x3_t val)
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuqaddd_s64 (int64_t __a, uint64_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- poly16x8x3_t temp;
-- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
-- __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+ return __builtin_aarch64_suqadddi_ssu (__a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_s32 (int32_t * __a, int32x2x3_t val)
-+#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \
-+ __extension__ extern __inline rettype \
-+ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-+ v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \
-+ { \
-+ return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \
-+ v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \
-+ }
-+
-+#define __INTERLEAVE_LIST(op) \
-+ __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,) \
-+ __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
-+ __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
-+ __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
-+ __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \
-+ __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \
-+ __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \
-+ __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
-+ __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
-+ __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
-+ __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q) \
-+ __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
-+ __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
-+ __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
-+ __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \
-+ __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \
-+ __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \
-+ __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \
-+ __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \
-+ __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q)
-+
-+/* vuzp */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_f16 (float16x4_t __a, float16x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- int32x4x3_t temp;
-- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
-- __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_u8 (uint8_t * __a, uint8x8x3_t val)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_f32 (float32x2_t __a, float32x2_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- uint8x16x3_t temp;
-- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
-- __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_u16 (uint16_t * __a, uint16x4x3_t val)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_p8 (poly8x8_t __a, poly8x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- uint16x8x3_t temp;
-- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
-- __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_u32 (uint32_t * __a, uint32x2x3_t val)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_p16 (poly16x4_t __a, poly16x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- uint32x4x3_t temp;
-- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
-- __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_f16 (float16_t * __a, float16x4x3_t val)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_s8 (int8x8_t __a, int8x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- float16x8x3_t temp;
-- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
-- __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3_f32 (float32_t * __a, float32x2x3_t val)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_s16 (int16x4_t __a, int16x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- float32x4x3_t temp;
-- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
-- __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_s8 (int8_t * __a, int8x16x3_t val)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_s32 (int32x2_t __a, int32x2_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-- __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-- __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_s16 (int16_t * __a, int16x8x3_t val)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-- __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-- __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_s32 (int32_t * __a, int32x4x3_t val)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_f16 (float16x8_t __a, float16x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-- __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_s64 (int64_t * __a, int64x2x3_t val)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-- __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_f64 (float64x2_t __a, float64x2_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-- __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-- __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-- __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_s8 (int8x16_t __a, int8x16_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-- __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_f16 (float16_t * __a, float16x8x3_t val)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_s16 (int16x8_t __a, int16x8_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
-- __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_f32 (float32_t * __a, float32x4x3_t val)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_s32 (int32x4_t __a, int32x4_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
-- __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst3q_f64 (float64_t * __a, float64x2x3_t val)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_s64 (int64x2_t __a, int64x2_t __b)
- {
-- __builtin_aarch64_simd_ci __o;
-- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
-- __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline void
--vst4_s64 (int64_t * __a, int64x1x4_t val)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- int64x2x4_t temp;
-- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
-- temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
-- __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
-+#endif
- }
-
--__extension__ static __inline void
--vst4_u64 (uint64_t * __a, uint64x1x4_t val)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- uint64x2x4_t temp;
-- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
-- __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-+#endif
- }
-
--__extension__ static __inline void
--vst4_f64 (float64_t * __a, float64x1x4_t val)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- float64x2x4_t temp;
-- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3);
-- __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
-+#endif
- }
-
--__extension__ static __inline void
--vst4_s8 (int8_t * __a, int8x8x4_t val)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- int8x16x4_t temp;
-- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
-- temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-- __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_p8 (poly8_t * __a, poly8x8x4_t val)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_f32 (float32x2_t __a, float32x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- poly8x16x4_t temp;
-- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-- __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_s16 (int16_t * __a, int16x4x4_t val)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_p8 (poly8x8_t __a, poly8x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- int16x8x4_t temp;
-- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
-- temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-- __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_p16 (poly16_t * __a, poly16x4x4_t val)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_p16 (poly16x4_t __a, poly16x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- poly16x8x4_t temp;
-- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-- __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_s32 (int32_t * __a, int32x2x4_t val)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_s8 (int8x8_t __a, int8x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- int32x4x4_t temp;
-- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-- temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
-- temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
-- __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_u8 (uint8_t * __a, uint8x8x4_t val)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_s16 (int16x4_t __a, int16x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- uint8x16x4_t temp;
-- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-- __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_u16 (uint16_t * __a, uint16x4x4_t val)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_s32 (int32x2_t __a, int32x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- uint16x8x4_t temp;
-- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-- __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_u32 (uint32_t * __a, uint32x2x4_t val)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- uint32x4x4_t temp;
-- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
-- __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_f16 (float16_t * __a, float16x4x4_t val)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_u16 (uint16x4_t __a, uint16x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- float16x8x4_t temp;
-- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3);
-- __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4_f32 (float32_t * __a, float32x2x4_t val)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2_u32 (uint32x2_t __a, uint32x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- float32x4x4_t temp;
-- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0)));
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3);
-- __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_s8 (int8_t * __a, int8x16x4_t val)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_f16 (float16x8_t __a, float16x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-- __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-- __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_s16 (int16_t * __a, int16x8x4_t val)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_f64 (float64x2_t __a, float64x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-- __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-- __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_s32 (int32_t * __a, int32x4x4_t val)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-- __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_s64 (int64_t * __a, int64x2x4_t val)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_s8 (int8x16_t __a, int8x16_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-- __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_s16 (int16x8_t __a, int16x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-- __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_s32 (int32x4_t __a, int32x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-- __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_s64 (int64x2_t __a, int64x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-- __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-- __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_f16 (float16_t * __a, float16x8x4_t val)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3);
-- __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_f32 (float32_t * __a, float32x4x4_t val)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
-- __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
-+#endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vst4q_f64 (float64_t * __a, float64x2x4_t val)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
- {
-- __builtin_aarch64_simd_xi __o;
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2);
-- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3);
-- __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
-+#endif
- }
-
--/* vsub */
-+__INTERLEAVE_LIST (uzp)
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vsubd_s64 (int64_t __a, int64_t __b)
-+/* vzip */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __a - __b;
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-+#endif
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vsubd_u64 (uint64_t __a, uint64_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_f32 (float32x2_t __a, float32x2_t __b)
- {
-- return __a - __b;
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--/* vtbx1 */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_p8 (poly8x8_t __a, poly8x8_t __b)
- {
-- uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
-- vmov_n_u8 (8));
-- int8x8_t __tbl = vtbl1_s8 (__tab, __idx);
--
-- return vbsl_s8 (__mask, __tbl, __r);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-+#endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_p16 (poly16x4_t __a, poly16x4_t __b)
- {
-- uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
-- uint8x8_t __tbl = vtbl1_u8 (__tab, __idx);
--
-- return vbsl_u8 (__mask, __tbl, __r);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-+#endif
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_s8 (int8x8_t __a, int8x8_t __b)
- {
-- uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
-- poly8x8_t __tbl = vtbl1_p8 (__tab, __idx);
--
-- return vbsl_p8 (__mask, __tbl, __r);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-+#endif
- }
-
--/* vtbx3 */
--
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_s16 (int16x4_t __a, int16x4_t __b)
- {
-- uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
-- vmov_n_u8 (24));
-- int8x8_t __tbl = vtbl3_s8 (__tab, __idx);
--
-- return vbsl_s8 (__mask, __tbl, __r);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-+#endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_s32 (int32x2_t __a, int32x2_t __b)
- {
-- uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
-- uint8x8_t __tbl = vtbl3_u8 (__tab, __idx);
--
-- return vbsl_u8 (__mask, __tbl, __r);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_u8 (uint8x8_t __a, uint8x8_t __b)
- {
-- uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
-- poly8x8_t __tbl = vtbl3_p8 (__tab, __idx);
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-+#endif
-+}
-
-- return vbsl_p8 (__mask, __tbl, __r);
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_u16 (uint16x4_t __a, uint16x4_t __b)
-+{
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
-+#endif
- }
-
--/* vtbx4 */
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1_u32 (uint32x2_t __a, uint32x2_t __b)
-+{
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+#endif
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_f16 (float16x8_t __a, float16x8_t __b)
- {
-- int8x8_t result;
-- int8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
-- temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx);
-- return result;
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
-+#else
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
-+#endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_f32 (float32x4_t __a, float32x4_t __b)
- {
-- uint8x8_t result;
-- uint8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
-- temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
-- (int8x8_t)__idx);
-- return result;
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
-+#endif
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_f64 (float64x2_t __a, float64x2_t __b)
- {
-- poly8x8_t result;
-- poly8x16x2_t temp;
-- __builtin_aarch64_simd_oi __o;
-- temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
-- temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[0], 0);
-- __o = __builtin_aarch64_set_qregoiv16qi (__o,
-- (int8x16_t) temp.val[1], 1);
-- result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
-- (int8x8_t)__idx);
-- return result;
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+#endif
- }
-
--/* vtrn */
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_p8 (poly8x16_t __a, poly8x16_t __b)
-+{
-+#ifdef __AARCH64EB__
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
-+#else
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
-+#endif
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vtrn1_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_p16 (poly16x8_t __a, poly16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t)
-+ {12, 4, 13, 5, 14, 6, 15, 7});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
- #endif
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtrn1_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_s8 (int8x16_t __a, int8x16_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
- #else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
- #endif
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vtrn1_p16 (poly16x4_t __a, poly16x4_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_s16 (int16x8_t __a, int16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t)
-+ {12, 4, 13, 5, 14, 6, 15, 7});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
- #endif
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtrn1_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_s32 (int32x4_t __a, int32x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
- #endif
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vtrn1_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_s64 (int64x2_t __a, int64x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
- #endif
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vtrn1_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
- #endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtrn1_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t)
-+ {12, 4, 13, 5, 14, 6, 15, 7});
- #else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
- #endif
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vtrn1_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
- #endif
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vtrn1_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
- #endif
- }
-
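Every vzip1/vzip2 body in this hunk is a constant __builtin_shuffle; the __AARCH64EB__ branch exists because GCC numbers shuffle lanes in memory order, which on big-endian AArch64 is reversed relative to the architectural lane order the intrinsic promises, so the index vector is rewritten per endianness. A minimal sketch with GCC vector extensions (v4si/v4su are local typedefs, not the NEON types), mirroring vzip1q_s32 above:

typedef int v4si __attribute__ ((vector_size (16)));
typedef unsigned int v4su __attribute__ ((vector_size (16)));

static inline v4si
zip1_model (v4si a, v4si b)
{
#ifdef __AARCH64EB__
  /* Big-endian: same zip, expressed in memory-order lane numbers.  */
  return __builtin_shuffle (a, b, (v4su) {6, 2, 7, 3});
#else
  return __builtin_shuffle (a, b, (v4su) {0, 4, 1, 5});
#endif
}
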
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vtrn1q_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_f16 (float16x4_t __a, float16x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vtrn1q_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_f32 (float32x2_t __a, float32x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
- #else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
- #endif
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b)
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_p16 (poly16x4_t __a, poly16x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vtrn1q_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_s8 (int8x8_t __a, int8x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vtrn1q_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_s16 (int16x4_t __a, int16x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vtrn1q_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_s32 (int32x2_t __a, int32x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
- #endif
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vtrn1q_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+ return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
-+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
-+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
- #endif
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_f16 (float16x8_t __a, float16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
-+ return __builtin_shuffle (__a, __b,
-+ (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_f32 (float32x4_t __a, float32x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vtrn2_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_f64 (float64x2_t __a, float64x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
- #endif
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vtrn2_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
- #else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
- #endif
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vtrn2_p16 (poly16x4_t __a, poly16x4_t __b)
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_p16 (poly16x8_t __a, poly16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t)
-+ {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vtrn2_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_s8 (int8x16_t __a, int8x16_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
- #else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
- #endif
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vtrn2_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_s16 (int16x8_t __a, int16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t)
-+ {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vtrn2_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_s32 (int32x4_t __a, int32x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtrn2_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_s64 (int64x2_t __a, int64x2_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
- #else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
-+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
- #endif
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vtrn2_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
- #else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
-+ return __builtin_shuffle (__a, __b, (uint8x16_t)
-+ {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
- #endif
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vtrn2_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
-+ return __builtin_shuffle (__a, __b, (uint16x8_t)
-+ {4, 12, 5, 13, 6, 14, 7, 15});
- #endif
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vtrn2q_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- #ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
- #else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
-+ return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
- #endif
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vtrn2q_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip2q_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- #ifdef __AARCH64EB__
- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
-@@ -24455,1319 +30368,1184 @@ vtrn2q_f64 (float64x2_t __a, float64x2_t __b)
- #endif
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b)
-+__INTERLEAVE_LIST (zip)
-+
-+#undef __INTERLEAVE_LIST
-+#undef __DEFINTERLEAVE
-+
-+/* End of optimal implementations in approved order. */
-+
-+#pragma GCC pop_options
-+
-+/* ARMv8.2-A FP16 intrinsics. */
-+
-+#include "arm_fp16.h"
-+
-+#pragma GCC push_options
-+#pragma GCC target ("arch=armv8.2-a+fp16")
-+
-+/* ARMv8.2-A FP16 one operand vector intrinsics. */
-+
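Note the declaration pattern the new side uses throughout: `extern __inline` with `__gnu_inline__` gives GNU89 inline semantics, so the header never emits a stand-alone copy of the function regardless of -std=, and `__artificial__` keeps the wrapper out of single-stepping in the debugger. A header-only wrapper in the same style (my_add16 is an illustrative name):

__extension__ extern __inline unsigned short
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
my_add16 (unsigned short a, unsigned short b)
{
  /* Always inlined at every call site; no out-of-line definition
     is ever emitted for this symbol.  */
  return (unsigned short) (a + b);
}
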
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_absv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_absv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_cmeqv4hf_uss (__a, vdup_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_cmeqv8hf_uss (__a, vdupq_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_cmgev4hf_uss (__a, vdup_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_cmgev8hf_uss (__a, vdupq_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_cmgtv4hf_uss (__a, vdup_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_cmgtv8hf_uss (__a, vdupq_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_cmlev4hf_uss (__a, vdup_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_cmlev8hf_uss (__a, vdupq_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_cmltv4hf_uss (__a, vdup_n_f16 (0.0f));
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_cmltv8hf_uss (__a, vdupq_n_f16 (0.0f));
-+}
-+
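The one-operand zero comparisons above (vceqz, vcgez, ..., vcltz) all lower to the two-operand compare builtin fed a splatted zero via vdup_n_f16, and each result lane is an all-ones or all-zeros integer mask. A sketch of the same shape using GCC vector extensions on float lanes (ceqz_model is a stand-in, not the intrinsic):

typedef float v4sf __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));

static inline v4si
ceqz_model (v4sf a)
{
  v4sf zero = {0.0f, 0.0f, 0.0f, 0.0f};  /* vdup_n_f16 (0.0f) analogue */
  return a == zero;   /* each lane: -1 (all ones) where equal, else 0 */
}
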
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f16_s16 (int16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
--#endif
-+ return __builtin_aarch64_floatv4hiv4hf (__a);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f16_s16 (int16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
--#endif
-+ return __builtin_aarch64_floatv8hiv8hf (__a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vtrn2q_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f16_u16 (uint16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
--#endif
-+ return __builtin_aarch64_floatunsv4hiv4hf ((int16x4_t) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vtrn2q_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f16_u16 (uint16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
--#endif
-+ return __builtin_aarch64_floatunsv8hiv8hf ((int16x8_t) __a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vtrn2q_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_s16_f16 (float16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
--#endif
-+ return __builtin_aarch64_lbtruncv4hfv4hi (__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vtrn2q_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_s16_f16 (float16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_lbtruncv8hfv8hi (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_u16_f16 (float16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31});
--#endif
-+ return __builtin_aarch64_lbtruncuv4hfv4hi_us (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_u16_f16 (float16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
--#endif
-+ return __builtin_aarch64_lbtruncuv8hfv8hi_us (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_s16_f16 (float16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7});
--#endif
-+ return __builtin_aarch64_lroundv4hfv4hi (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_s16_f16 (float16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_lroundv8hfv8hi (__a);
- }
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
--vtrn_f32 (float32x2_t a, float32x2_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_u16_f16 (float16x4_t __a)
- {
-- return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)};
-+ return __builtin_aarch64_lrounduv4hfv4hi_us (__a);
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
--vtrn_p8 (poly8x8_t a, poly8x8_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_u16_f16 (float16x8_t __a)
- {
-- return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)};
-+ return __builtin_aarch64_lrounduv8hfv8hi_us (__a);
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
--vtrn_p16 (poly16x4_t a, poly16x4_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_s16_f16 (float16x4_t __a)
- {
-- return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)};
-+ return __builtin_aarch64_lfloorv4hfv4hi (__a);
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
--vtrn_s8 (int8x8_t a, int8x8_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_s16_f16 (float16x8_t __a)
- {
-- return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)};
-+ return __builtin_aarch64_lfloorv8hfv8hi (__a);
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
--vtrn_s16 (int16x4_t a, int16x4_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_u16_f16 (float16x4_t __a)
- {
-- return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)};
-+ return __builtin_aarch64_lflooruv4hfv4hi_us (__a);
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
--vtrn_s32 (int32x2_t a, int32x2_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_u16_f16 (float16x8_t __a)
- {
-- return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)};
-+ return __builtin_aarch64_lflooruv8hfv8hi_us (__a);
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
--vtrn_u8 (uint8x8_t a, uint8x8_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_s16_f16 (float16x4_t __a)
- {
-- return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)};
-+ return __builtin_aarch64_lfrintnv4hfv4hi (__a);
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
--vtrn_u16 (uint16x4_t a, uint16x4_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_s16_f16 (float16x8_t __a)
- {
-- return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)};
-+ return __builtin_aarch64_lfrintnv8hfv8hi (__a);
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
--vtrn_u32 (uint32x2_t a, uint32x2_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_u16_f16 (float16x4_t __a)
- {
-- return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)};
-+ return __builtin_aarch64_lfrintnuv4hfv4hi_us (__a);
- }
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
--vtrnq_f32 (float32x4_t a, float32x4_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_u16_f16 (float16x8_t __a)
- {
-- return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)};
-+ return __builtin_aarch64_lfrintnuv8hfv8hi_us (__a);
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
--vtrnq_p8 (poly8x16_t a, poly8x16_t b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_s16_f16 (float16x4_t __a)
- {
-- return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)};
-+ return __builtin_aarch64_lceilv4hfv4hi (__a);
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
--vtrnq_p16 (poly16x8_t a, poly16x8_t b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_s16_f16 (float16x8_t __a)
- {
-- return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)};
-+ return __builtin_aarch64_lceilv8hfv8hi (__a);
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
--vtrnq_s8 (int8x16_t a, int8x16_t b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_u16_f16 (float16x4_t __a)
- {
-- return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)};
-+ return __builtin_aarch64_lceiluv4hfv4hi_us (__a);
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
--vtrnq_s16 (int16x8_t a, int16x8_t b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_u16_f16 (float16x8_t __a)
- {
-- return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)};
-+ return __builtin_aarch64_lceiluv8hfv8hi_us (__a);
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
--vtrnq_s32 (int32x4_t a, int32x4_t b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_f16 (float16x4_t __a)
- {
-- return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)};
-+ return -__a;
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
--vtrnq_u8 (uint8x16_t a, uint8x16_t b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_f16 (float16x8_t __a)
- {
-- return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)};
-+ return -__a;
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
--vtrnq_u16 (uint16x8_t a, uint16x8_t b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpe_f16 (float16x4_t __a)
- {
-- return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)};
-+ return __builtin_aarch64_frecpev4hf (__a);
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
--vtrnq_u32 (uint32x4_t a, uint32x4_t b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpeq_f16 (float16x8_t __a)
- {
-- return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)};
-+ return __builtin_aarch64_frecpev8hf (__a);
- }
-
--/* vtst */
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnd_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_btruncv4hf (__a);
-+}
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtst_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndq_f16 (float16x8_t __a)
- {
-- return (uint8x8_t) ((__a & __b) != 0);
-+ return __builtin_aarch64_btruncv8hf (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vtst_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnda_f16 (float16x4_t __a)
- {
-- return (uint16x4_t) ((__a & __b) != 0);
-+ return __builtin_aarch64_roundv4hf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vtst_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndaq_f16 (float16x8_t __a)
- {
-- return (uint32x2_t) ((__a & __b) != 0);
-+ return __builtin_aarch64_roundv8hf (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vtst_s64 (int64x1_t __a, int64x1_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndi_f16 (float16x4_t __a)
- {
-- return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_nearbyintv4hf (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vtst_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndiq_f16 (float16x8_t __a)
- {
-- return ((__a & __b) != 0);
-+ return __builtin_aarch64_nearbyintv8hf (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vtst_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndm_f16 (float16x4_t __a)
- {
-- return ((__a & __b) != 0);
-+ return __builtin_aarch64_floorv4hf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vtst_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndmq_f16 (float16x8_t __a)
- {
-- return ((__a & __b) != 0);
-+ return __builtin_aarch64_floorv8hf (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vtst_u64 (uint64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndn_f16 (float16x4_t __a)
- {
-- return ((__a & __b) != __AARCH64_UINT64_C (0));
-+ return __builtin_aarch64_frintnv4hf (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vtstq_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndnq_f16 (float16x8_t __a)
- {
-- return (uint8x16_t) ((__a & __b) != 0);
-+ return __builtin_aarch64_frintnv8hf (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vtstq_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndp_f16 (float16x4_t __a)
- {
-- return (uint16x8_t) ((__a & __b) != 0);
-+ return __builtin_aarch64_ceilv4hf (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vtstq_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndpq_f16 (float16x8_t __a)
- {
-- return (uint32x4_t) ((__a & __b) != 0);
-+ return __builtin_aarch64_ceilv8hf (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vtstq_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndx_f16 (float16x4_t __a)
- {
-- return (uint64x2_t) ((__a & __b) != __AARCH64_INT64_C (0));
-+ return __builtin_aarch64_rintv4hf (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndxq_f16 (float16x8_t __a)
- {
-- return ((__a & __b) != 0);
-+ return __builtin_aarch64_rintv8hf (__a);
- }
-
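The builtin names above spell out which rounding mode each vrnd* suffix uses; a scalar crib, assuming <math.h> (frintn has no exact C99 counterpart, so that row is approximate):

#include <math.h>

static inline void
rounding_modes (double x, double r[7])
{
  r[0] = trunc (x);     /* vrnd  -> btrunc:    toward zero            */
  r[1] = round (x);     /* vrnda -> round:     ties away from zero    */
  r[2] = nearbyint (x); /* vrndi -> nearbyint: current mode, quiet    */
  r[3] = floor (x);     /* vrndm -> floor:     toward -infinity       */
  r[4] = ceil (x);      /* vrndp -> ceil:      toward +infinity       */
  r[5] = rint (x);      /* vrndx -> rint:      current mode, inexact  */
  r[6] = nearbyint (x); /* vrndn -> frintn:    ties to even (the A64
                           insn; nearbyint matches it only under the
                           default rounding mode)                     */
}
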
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrte_f16 (float16x4_t a)
- {
-- return ((__a & __b) != 0);
-+ return __builtin_aarch64_rsqrtev4hf (a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrteq_f16 (float16x8_t a)
- {
-- return ((__a & __b) != 0);
-+ return __builtin_aarch64_rsqrtev8hf (a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vtstq_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrt_f16 (float16x4_t a)
- {
-- return ((__a & __b) != __AARCH64_UINT64_C (0));
-+ return __builtin_aarch64_sqrtv4hf (a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vtstd_s64 (int64_t __a, int64_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsqrtq_f16 (float16x8_t a)
- {
-- return (__a & __b) ? -1ll : 0ll;
-+ return __builtin_aarch64_sqrtv8hf (a);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
--vtstd_u64 (uint64_t __a, uint64_t __b)
-+/* ARMv8.2-A FP16 two operands vector intrinsics. */
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vadd_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return (__a & __b) ? -1ll : 0ll;
-+ return __a + __b;
- }
-
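The removed vtst_* bodies above are just bitwise AND followed by a lane-wise `!= 0`, which GCC's vector extensions define to produce 0 or all-ones per lane; the scalar vtstd variants spell the same test out as `? -1ll : 0ll`. A compact model (tst_model is a stand-in name):

typedef short v4hi __attribute__ ((vector_size (8)));

static inline v4hi
tst_model (v4hi a, v4hi b)
{
  /* Each lane: -1 if the operands share any set bit, else 0.  */
  return (a & b) != 0;
}
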
--/* vuqadd */
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __a + __b;
-+}
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vuqadd_s8 (int8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabd_f16 (float16x4_t a, float16x4_t b)
- {
-- return __builtin_aarch64_suqaddv8qi_ssu (__a, __b);
-+ return __builtin_aarch64_fabdv4hf (a, b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vuqadd_s16 (int16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabdq_f16 (float16x8_t a, float16x8_t b)
- {
-- return __builtin_aarch64_suqaddv4hi_ssu (__a, __b);
-+ return __builtin_aarch64_fabdv8hf (a, b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vuqadd_s32 (int32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcage_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __builtin_aarch64_suqaddv2si_ssu (__a, __b);
-+ return __builtin_aarch64_facgev4hf_uss (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
--vuqadd_s64 (int64x1_t __a, uint64x1_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcageq_f16 (float16x8_t __a, float16x8_t __b)
- {
-- return (int64x1_t) {__builtin_aarch64_suqadddi_ssu (__a[0], __b[0])};
-+ return __builtin_aarch64_facgev8hf_uss (__a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vuqaddq_s8 (int8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagt_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __builtin_aarch64_suqaddv16qi_ssu (__a, __b);
-+ return __builtin_aarch64_facgtv4hf_uss (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vuqaddq_s16 (int16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagtq_f16 (float16x8_t __a, float16x8_t __b)
- {
-- return __builtin_aarch64_suqaddv8hi_ssu (__a, __b);
-+ return __builtin_aarch64_facgtv8hf_uss (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vuqaddq_s32 (int32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcale_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __builtin_aarch64_suqaddv4si_ssu (__a, __b);
-+ return __builtin_aarch64_faclev4hf_uss (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vuqaddq_s64 (int64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaleq_f16 (float16x8_t __a, float16x8_t __b)
- {
-- return __builtin_aarch64_suqaddv2di_ssu (__a, __b);
-+ return __builtin_aarch64_faclev8hf_uss (__a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
--vuqaddb_s8 (int8_t __a, uint8_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalt_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __builtin_aarch64_suqaddqi_ssu (__a, __b);
-+ return __builtin_aarch64_facltv4hf_uss (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
--vuqaddh_s16 (int16_t __a, uint16_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltq_f16 (float16x8_t __a, float16x8_t __b)
- {
-- return __builtin_aarch64_suqaddhi_ssu (__a, __b);
-+ return __builtin_aarch64_facltv8hf_uss (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
--vuqadds_s32 (int32_t __a, uint32_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_f16 (float16x4_t __a, float16x4_t __b)
- {
-- return __builtin_aarch64_suqaddsi_ssu (__a, __b);
-+ return __builtin_aarch64_cmeqv4hf_uss (__a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
--vuqaddd_s64 (int64_t __a, uint64_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_f16 (float16x8_t __a, float16x8_t __b)
- {
-- return __builtin_aarch64_suqadddi_ssu (__a, __b);
-+ return __builtin_aarch64_cmeqv8hf_uss (__a, __b);
- }
-
--#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \
-- __extension__ static __inline rettype \
-- __attribute__ ((__always_inline__)) \
-- v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \
-- { \
-- return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \
-- v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \
-- }
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_aarch64_cmgev4hf_uss (__a, __b);
-+}
-
--#define __INTERLEAVE_LIST(op) \
-- __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
-- __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
-- __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
-- __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \
-- __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \
-- __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \
-- __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
-- __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
-- __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
-- __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
-- __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
-- __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
-- __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \
-- __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \
-- __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \
-- __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \
-- __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \
-- __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_aarch64_cmgev8hf_uss (__a, __b);
-+}
-
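The removed __DEFINTERLEAVE/__INTERLEAVE_LIST pair above stamps out the combined vzip/vuzp/vtrn forms by token-pasting the operation name and type suffix, pairing the `1` and `2` halves into the x2 return struct. A self-contained sketch of that generator shape (vecT, pairT and the zip helpers are illustrative stand-ins, not the NEON names):

typedef int vecT __attribute__ ((vector_size (8)));
typedef struct { vecT val[2]; } pairT;

static inline vecT zip1_s32 (vecT a, vecT b)
{ return __builtin_shuffle (a, b, (vecT) {0, 2}); }
static inline vecT zip2_s32 (vecT a, vecT b)
{ return __builtin_shuffle (a, b, (vecT) {1, 3}); }

/* Token-paste op and suffix; pair the two halves into one struct.  */
#define DEF_INTERLEAVE(op, suffix)                       \
  static inline pairT op##_##suffix (vecT a, vecT b)     \
  { return (pairT) { { op##1_##suffix (a, b),            \
                       op##2_##suffix (a, b) } }; }

DEF_INTERLEAVE (zip, s32)   /* defines zip_s32 (a, b) */
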
--/* vuzp */
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_aarch64_cmgtv4hf_uss (__a, __b);
-+}
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vuzp1_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_cmgtv8hf_uss (__a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vuzp1_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
--#endif
-+ return __builtin_aarch64_cmlev4hf_uss (__a, __b);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vuzp1_p16 (poly16x4_t __a, poly16x4_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
--#endif
-+ return __builtin_aarch64_cmlev8hf_uss (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vuzp1_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
--#endif
-+ return __builtin_aarch64_cmltv4hf_uss (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vuzp1_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
--#endif
-+ return __builtin_aarch64_cmltv8hf_uss (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vuzp1_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f16_s16 (int16x4_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_scvtfv4hi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vuzp1_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
--#endif
-+ return __builtin_aarch64_scvtfv8hi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vuzp1_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
--#endif
-+ return __builtin_aarch64_ucvtfv4hi_sus (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
-+{
-+ return __builtin_aarch64_ucvtfv8hi_sus (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vuzp1_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_s16_f16 (float16x4_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_fcvtzsv4hf (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
--#endif
-+ return __builtin_aarch64_fcvtzsv8hf (__a, __b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vuzp1q_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_u16_f16 (float16x4_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_fcvtzuv4hf_uss (__a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
--#endif
-+ return __builtin_aarch64_fcvtzuv8hf_uss (__a, __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdiv_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
--#endif
-+ return __a / __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vuzp1q_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdivq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
--#endif
-+ return __a / __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vuzp1q_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
--#endif
-+ return __builtin_aarch64_smax_nanv4hf (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vuzp1q_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
--#endif
-+ return __builtin_aarch64_smax_nanv8hf (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vuzp1q_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_fmaxv4hf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
--#endif
-+ return __builtin_aarch64_fmaxv8hf (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
--#endif
-+ return __builtin_aarch64_smin_nanv4hf (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
--#endif
-+ return __builtin_aarch64_smin_nanv8hf (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_fminv4hf (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vuzp2_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_fminv8hf (__a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vuzp2_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
--#endif
-+ return __a * __b;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vuzp2_p16 (poly16x4_t __a, poly16x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
--#endif
-+ return __a * __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vuzp2_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
--#endif
-+ return __builtin_aarch64_fmulxv4hf (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vuzp2_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
--#endif
-+ return __builtin_aarch64_fmulxv8hf (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vuzp2_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_f16 (float16x4_t a, float16x4_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_faddpv4hf (a, b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vuzp2_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpaddq_f16 (float16x8_t a, float16x8_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
--#endif
-+ return __builtin_aarch64_faddpv8hf (a, b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vuzp2_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_f16 (float16x4_t a, float16x4_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
--#endif
-+ return __builtin_aarch64_smax_nanpv4hf (a, b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vuzp2_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxq_f16 (float16x8_t a, float16x8_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_smax_nanpv8hf (a, b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnm_f16 (float16x4_t a, float16x4_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
--#endif
-+ return __builtin_aarch64_smaxpv4hf (a, b);
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vuzp2q_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmaxnmq_f16 (float16x8_t a, float16x8_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_smaxpv8hf (a, b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_f16 (float16x4_t a, float16x4_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
--#endif
-+ return __builtin_aarch64_smin_nanpv4hf (a, b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminq_f16 (float16x8_t a, float16x8_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
--#endif
-+ return __builtin_aarch64_smin_nanpv8hf (a, b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vuzp2q_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnm_f16 (float16x4_t a, float16x4_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
--#else
-- return __builtin_shuffle (__a, __b,
-- (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
--#endif
-+ return __builtin_aarch64_sminpv4hf (a, b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vuzp2q_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpminnmq_f16 (float16x8_t a, float16x8_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
--#endif
-+ return __builtin_aarch64_sminpv8hf (a, b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vuzp2q_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecps_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
--#endif
-+ return __builtin_aarch64_frecpsv4hf (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vuzp2q_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_frecpsv8hf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrts_f16 (float16x4_t a, float16x4_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
--#endif
-+ return __builtin_aarch64_rsqrtsv4hf (a, b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtsq_f16 (float16x8_t a, float16x8_t b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
--#endif
-+ return __builtin_aarch64_rsqrtsv8hf (a, b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsub_f16 (float16x4_t __a, float16x4_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
--#endif
-+ return __a - __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsubq_f16 (float16x8_t __a, float16x8_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __a - __b;
- }
-
--__INTERLEAVE_LIST (uzp)
--
--/* vzip */
-+/* ARMv8.2-A FP16 three operands vector intrinsics. */
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vzip1_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
--#endif
-+ return __builtin_aarch64_fmav4hf (__b, __c, __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vzip1_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
--#endif
-+ return __builtin_aarch64_fmav8hf (__b, __c, __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vzip1_p16 (poly16x4_t __a, poly16x4_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
--#endif
-+ return __builtin_aarch64_fnmav4hf (__b, __c, __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vzip1_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
--#endif
-+ return __builtin_aarch64_fnmav8hf (__b, __c, __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vzip1_s16 (int16x4_t __a, int16x4_t __b)
--{
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
--#endif
-+/* ARMv8.2-A FP16 lane vector intrinsics. */
-+
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmah_lane_f16 (float16_t __a, float16_t __b,
-+ float16x4_t __c, const int __lane)
-+{
-+ return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vzip1_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmah_laneq_f16 (float16_t __a, float16_t __b,
-+ float16x8_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
--#endif
-+ return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vzip1_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_lane_f16 (float16x4_t __a, float16x4_t __b,
-+ float16x4_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
--#endif
-+ return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vzip1_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b,
-+ float16x4_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
--#endif
-+ return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vzip1_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_laneq_f16 (float16x4_t __a, float16x4_t __b,
-+ float16x8_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
--#endif
-+ return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vzip1q_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b,
-+ float16x8_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
--#endif
-+ return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vzip1q_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
--#endif
-+ return vfma_f16 (__a, __b, vdup_n_f16 (__c));
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vzip1q_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
--#endif
-+ return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c));
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vzip1q_p16 (poly16x8_t __a, poly16x8_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsh_lane_f16 (float16_t __a, float16_t __b,
-+ float16x4_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t)
-- {12, 4, 13, 5, 14, 6, 15, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
--#endif
-+ return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vzip1q_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsh_laneq_f16 (float16_t __a, float16_t __b,
-+ float16x8_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
--#endif
-+ return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane));
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vzip1q_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_lane_f16 (float16x4_t __a, float16x4_t __b,
-+ float16x4_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t)
-- {12, 4, 13, 5, 14, 6, 15, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
--#endif
-+ return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vzip1q_s32 (int32x4_t __a, int32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b,
-+ float16x4_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
--#endif
-+ return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vzip1q_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_laneq_f16 (float16x4_t __a, float16x4_t __b,
-+ float16x8_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
--#endif
-+ return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vzip1q_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b,
-+ float16x8_t __c, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
--#endif
-+ return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vzip1q_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t)
-- {12, 4, 13, 5, 14, 6, 15, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
--#endif
-+ return vfms_f16 (__a, __b, vdup_n_f16 (__c));
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vzip1q_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
--#endif
-+ return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c));
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
--#endif
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
--vzip2_f32 (float32x2_t __a, float32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
--#endif
-+ return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
--vzip2_p8 (poly8x8_t __a, poly8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
--#endif
-+ return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
--vzip2_p16 (poly16x4_t __a, poly16x4_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
--#endif
-+ return __a * __aarch64_vget_lane_any (__b, __lane);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
--vzip2_s8 (int8x8_t __a, int8x8_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
--#endif
-+ return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
--vzip2_s16 (int16x4_t __a, int16x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
--#endif
-+ return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
--vzip2_s32 (int32x2_t __a, int32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_f16 (float16x4_t __a, float16_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
--#endif
-+ return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
--vzip2_u8 (uint8x8_t __a, uint8x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_f16 (float16x8_t __a, float16_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
--#endif
-+ return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
--vzip2_u16 (uint16x4_t __a, uint16x4_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
--#endif
-+ return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane));
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
--vzip2_u32 (uint32x2_t __a, uint32x2_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
--#endif
-+ return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane));
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
--vzip2q_f32 (float32x4_t __a, float32x4_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
--#endif
-+ return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane));
- }
-
--__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
--vzip2q_f64 (float64x2_t __a, float64x2_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane));
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
--vzip2q_p8 (poly8x16_t __a, poly8x16_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
--#endif
-+ return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane));
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
--vzip2q_p16 (poly16x8_t __a, poly16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t)
-- {4, 12, 5, 13, 6, 14, 7, 15});
--#endif
-+ return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane));
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
--vzip2q_s8 (int8x16_t __a, int8x16_t __b)
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulx_n_f16 (float16x4_t __a, float16_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
--#endif
-+ return vmulx_f16 (__a, vdup_n_f16 (__b));
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
--vzip2q_s16 (int16x8_t __a, int16x8_t __b)
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulxq_n_f16 (float16x8_t __a, float16_t __b)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t)
-- {4, 12, 5, 13, 6, 14, 7, 15});
--#endif
-+ return vmulxq_f16 (__a, vdupq_n_f16 (__b));
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
--vzip2q_s32 (int32x4_t __a, int32x4_t __b)
-+/* ARMv8.2-A FP16 reduction vector intrinsics. */
-+
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxv_f16 (float16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
--#endif
-+ return __builtin_aarch64_reduc_smax_nan_scal_v4hf (__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
--vzip2q_s64 (int64x2_t __a, int64x2_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxvq_f16 (float16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_reduc_smax_nan_scal_v8hf (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vzip2q_u8 (uint8x16_t __a, uint8x16_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminv_f16 (float16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
--#else
-- return __builtin_shuffle (__a, __b, (uint8x16_t)
-- {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
--#endif
-+ return __builtin_aarch64_reduc_smin_nan_scal_v4hf (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
--vzip2q_u16 (uint16x8_t __a, uint16x8_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminvq_f16 (float16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
--#else
-- return __builtin_shuffle (__a, __b, (uint16x8_t)
-- {4, 12, 5, 13, 6, 14, 7, 15});
--#endif
-+ return __builtin_aarch64_reduc_smin_nan_scal_v8hf (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vzip2q_u32 (uint32x4_t __a, uint32x4_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmv_f16 (float16x4_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
--#else
-- return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
--#endif
-+ return __builtin_aarch64_reduc_smax_scal_v4hf (__a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
--vzip2q_u64 (uint64x2_t __a, uint64x2_t __b)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmvq_f16 (float16x8_t __a)
- {
--#ifdef __AARCH64EB__
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
--#else
-- return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
--#endif
-+ return __builtin_aarch64_reduc_smax_scal_v8hf (__a);
- }
-
--__INTERLEAVE_LIST (zip)
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmv_f16 (float16x4_t __a)
-+{
-+ return __builtin_aarch64_reduc_smin_scal_v4hf (__a);
-+}
-
--#undef __INTERLEAVE_LIST
--#undef __DEFINTERLEAVE
-+__extension__ extern __inline float16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmvq_f16 (float16x8_t __a)
-+{
-+ return __builtin_aarch64_reduc_smin_scal_v8hf (__a);
-+}
-
--/* End of optimal implementations in approved order. */
-+#pragma GCC pop_options
-
- #undef __aarch64_vget_lane_any
-
- #undef __aarch64_vdup_lane_any
-+#undef __aarch64_vdup_lane_f16
- #undef __aarch64_vdup_lane_f32
- #undef __aarch64_vdup_lane_f64
- #undef __aarch64_vdup_lane_p8
-@@ -25780,6 +31558,7 @@ __INTERLEAVE_LIST (zip)
- #undef __aarch64_vdup_lane_u16
- #undef __aarch64_vdup_lane_u32
- #undef __aarch64_vdup_lane_u64
-+#undef __aarch64_vdup_laneq_f16
- #undef __aarch64_vdup_laneq_f32
- #undef __aarch64_vdup_laneq_f64
- #undef __aarch64_vdup_laneq_p8
-@@ -25792,6 +31571,7 @@ __INTERLEAVE_LIST (zip)
- #undef __aarch64_vdup_laneq_u16
- #undef __aarch64_vdup_laneq_u32
- #undef __aarch64_vdup_laneq_u64
-+#undef __aarch64_vdupq_lane_f16
- #undef __aarch64_vdupq_lane_f32
- #undef __aarch64_vdupq_lane_f64
- #undef __aarch64_vdupq_lane_p8
-@@ -25804,6 +31584,7 @@ __INTERLEAVE_LIST (zip)
- #undef __aarch64_vdupq_lane_u16
- #undef __aarch64_vdupq_lane_u32
- #undef __aarch64_vdupq_lane_u64
-+#undef __aarch64_vdupq_laneq_f16
- #undef __aarch64_vdupq_laneq_f32
- #undef __aarch64_vdupq_laneq_f64
- #undef __aarch64_vdupq_laneq_p8
-@@ -25817,6 +31598,4 @@ __INTERLEAVE_LIST (zip)
- #undef __aarch64_vdupq_laneq_u32
- #undef __aarch64_vdupq_laneq_u64
-
--#pragma GCC pop_options
--
- #endif
---- a/src/gcc/config/aarch64/atomics.md
-+++ b/src/gcc/config/aarch64/atomics.md
-@@ -583,7 +583,7 @@
- }
- )
-
--;; ARMv8.1 LSE instructions.
-+;; ARMv8.1-A LSE instructions.
-
- ;; Atomic swap with memory.
- (define_insn "aarch64_atomic_swp<mode>"
---- a/src/gcc/config/aarch64/cortex-a57-fma-steering.c
-+++ b/src/gcc/config/aarch64/cortex-a57-fma-steering.c
-@@ -35,7 +35,6 @@
- #include "context.h"
- #include "tree-pass.h"
- #include "regrename.h"
--#include "cortex-a57-fma-steering.h"
- #include "aarch64-protos.h"
-
- /* For better performance, the destination of FMADD/FMSUB instructions should
-@@ -923,10 +922,10 @@ func_fma_steering::analyze ()
- FOR_BB_INSNS (bb, insn)
- {
- operand_rr_info *dest_op_info;
-- struct du_chain *chain;
-+ struct du_chain *chain = NULL;
- unsigned dest_regno;
-- fma_forest *forest;
-- du_head_p head;
-+ fma_forest *forest = NULL;
-+ du_head_p head = NULL;
- int i;
-
- if (!is_fmul_fmac_insn (insn, true))
-@@ -1068,21 +1067,8 @@ public:
-
- /* Create a new fma steering pass instance. */
-
--static rtl_opt_pass *
-+rtl_opt_pass *
- make_pass_fma_steering (gcc::context *ctxt)
- {
- return new pass_fma_steering (ctxt);
- }
--
--/* Register the FMA steering pass to the pass manager. */
--
--void
--aarch64_register_fma_steering ()
--{
-- opt_pass *pass_fma_steering = make_pass_fma_steering (g);
--
-- struct register_pass_info fma_steering_info
-- = { pass_fma_steering, "rnreg", 1, PASS_POS_INSERT_AFTER };
--
-- register_pass (&fma_steering_info);
--}
---- a/src/gcc/config/aarch64/cortex-a57-fma-steering.h
-+++ b/src//dev/null
-@@ -1,22 +0,0 @@
--/* This file contains declarations for the FMA steering optimization
-- pass for Cortex-A57.
-- Copyright (C) 2015-2016 Free Software Foundation, Inc.
-- Contributed by ARM Ltd.
--
-- This file is part of GCC.
--
-- GCC is free software; you can redistribute it and/or modify it
-- under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 3, or (at your option)
-- any later version.
--
-- GCC is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-- General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with GCC; see the file COPYING3. If not see
-- <http://www.gnu.org/licenses/>. */
--
--void aarch64_register_fma_steering (void);
---- a/src/gcc/config/aarch64/geniterators.sh
-+++ b/src/gcc/config/aarch64/geniterators.sh
-@@ -23,10 +23,7 @@
- # BUILTIN_<ITERATOR> macros, which expand to VAR<N> Macros covering the
- # same set of modes as the iterator in iterators.md
- #
--# Find the <ITERATOR> definitions (may span several lines), skip the ones
--# which does not have a simple format because it contains characters we
--# don't want to or can't handle (e.g P, PTR iterators change depending on
--# Pmode and ptr_mode).
-+# Find the <ITERATOR> definitions (may span several lines).
- LC_ALL=C awk '
- BEGIN {
- print "/* -*- buffer-read-only: t -*- */"
-@@ -49,12 +46,24 @@ iterdef {
- sub(/.*\(define_mode_iterator/, "", s)
- }
-
--iterdef && s ~ /\)/ {
-+iterdef {
-+ # Count the parentheses, the iterator definition ends
-+ # if there are more closing ones than opening ones.
-+ nopen = gsub(/\(/, "(", s)
-+ nclose = gsub(/\)/, ")", s)
-+ if (nopen >= nclose)
-+ next
-+
- iterdef = 0
-
- gsub(/[ \t]+/, " ", s)
-- sub(/ *\).*/, "", s)
-+ sub(/ *\)[^)]*$/, "", s)
- sub(/^ /, "", s)
-+
-+ # Drop the conditions.
-+ gsub(/ *"[^"]*" *\)/, "", s)
-+ gsub(/\( */, "", s)
-+
- if (s !~ /^[A-Za-z0-9_]+ \[[A-Z0-9 ]*\]$/)
- next
- sub(/\[ */, "", s)
---- a/src/gcc/config/aarch64/iterators.md
-+++ b/src/gcc/config/aarch64/iterators.md
-@@ -26,6 +26,9 @@
- ;; Iterator for General Purpose Integer registers (32- and 64-bit modes)
- (define_mode_iterator GPI [SI DI])
-
-+;; Iterator for HI, SI, DI, some instructions can only work on these modes.
-+(define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI])
-+
- ;; Iterator for QI and HI modes
- (define_mode_iterator SHORT [QI HI])
-
-@@ -38,6 +41,9 @@
- ;; Iterator for General Purpose Floating-point registers (32- and 64-bit modes)
- (define_mode_iterator GPF [SF DF])
-
-+;; Iterator for all scalar floating point modes (HF, SF, DF)
-+(define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF])
-+
- ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
- (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
-
-@@ -88,11 +94,22 @@
- ;; Vector Float modes suitable for moving, loading and storing.
- (define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])
-
--;; Vector Float modes, barring HF modes.
-+;; Vector Float modes.
- (define_mode_iterator VDQF [V2SF V4SF V2DF])
-+(define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
-+ (V8HF "TARGET_SIMD_F16INST")
-+ V2SF V4SF V2DF])
-
- ;; Vector Float modes, and DF.
- (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
-+(define_mode_iterator VHSDF_DF [(V4HF "TARGET_SIMD_F16INST")
-+ (V8HF "TARGET_SIMD_F16INST")
-+ V2SF V4SF V2DF DF])
-+(define_mode_iterator VHSDF_HSDF [(V4HF "TARGET_SIMD_F16INST")
-+ (V8HF "TARGET_SIMD_F16INST")
-+ V2SF V4SF V2DF
-+ (HF "TARGET_SIMD_F16INST")
-+ SF DF])
-
- ;; Vector single Float modes.
- (define_mode_iterator VDQSF [V2SF V4SF])
-@@ -150,10 +167,30 @@
-
- ;; Vector modes except double int.
- (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
-+(define_mode_iterator VDQIF_F16 [V8QI V16QI V4HI V8HI V2SI V4SI
-+ V4HF V8HF V2SF V4SF V2DF])
-
- ;; Vector modes for S type.
- (define_mode_iterator VDQ_SI [V2SI V4SI])
-
-+;; Vector modes for S and D
-+(define_mode_iterator VDQ_SDI [V2SI V4SI V2DI])
-+
-+;; Vector modes for H, S and D
-+(define_mode_iterator VDQ_HSDI [(V4HI "TARGET_SIMD_F16INST")
-+ (V8HI "TARGET_SIMD_F16INST")
-+ V2SI V4SI V2DI])
-+
-+;; Scalar and Vector modes for S and D
-+(define_mode_iterator VSDQ_SDI [V2SI V4SI V2DI SI DI])
-+
-+;; Scalar and Vector modes for S and D, Vector modes for H.
-+(define_mode_iterator VSDQ_HSDI [(V4HI "TARGET_SIMD_F16INST")
-+ (V8HI "TARGET_SIMD_F16INST")
-+ V2SI V4SI V2DI
-+ (HI "TARGET_SIMD_F16INST")
-+ SI DI])
-+
- ;; Vector modes for Q and H types.
- (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
-
-@@ -193,7 +230,10 @@
- (define_mode_iterator DX [DI DF])
-
- ;; Modes available for <f>mul lane operations.
--(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
-+(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI
-+ (V4HF "TARGET_SIMD_F16INST")
-+ (V8HF "TARGET_SIMD_F16INST")
-+ V2SF V4SF V2DF])
-
- ;; Modes available for <f>mul lane operations changing lane count.
- (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
-@@ -342,8 +382,8 @@
- (define_mode_attr w [(QI "w") (HI "w") (SI "w") (DI "x") (SF "s") (DF "d")])
-
- ;; For inequal width int to float conversion
--(define_mode_attr w1 [(SF "w") (DF "x")])
--(define_mode_attr w2 [(SF "x") (DF "w")])
-+(define_mode_attr w1 [(HF "w") (SF "w") (DF "x")])
-+(define_mode_attr w2 [(HF "x") (SF "x") (DF "w")])
-
- (define_mode_attr short_mask [(HI "65535") (QI "255")])
-
-@@ -355,12 +395,13 @@
-
- ;; For scalar usage of vector/FP registers
- (define_mode_attr v [(QI "b") (HI "h") (SI "s") (DI "d")
-- (SF "s") (DF "d")
-+ (HF "h") (SF "s") (DF "d")
- (V8QI "") (V16QI "")
- (V4HI "") (V8HI "")
- (V2SI "") (V4SI "")
- (V2DI "") (V2SF "")
-- (V4SF "") (V2DF "")])
-+ (V4SF "") (V4HF "")
-+ (V8HF "") (V2DF "")])
-
- ;; For scalar usage of vector/FP registers, narrowing
- (define_mode_attr vn2 [(QI "") (HI "b") (SI "h") (DI "s")
-@@ -385,7 +426,7 @@
- (define_mode_attr vas [(DI "") (SI ".2s")])
-
- ;; Map a floating point mode to the appropriate register name prefix
--(define_mode_attr s [(SF "s") (DF "d")])
-+(define_mode_attr s [(HF "h") (SF "s") (DF "d")])
-
- ;; Give the length suffix letter for a sign- or zero-extension.
- (define_mode_attr size [(QI "b") (HI "h") (SI "w")])
-@@ -421,8 +462,8 @@
- (V4SF ".4s") (V2DF ".2d")
- (DI "") (SI "")
- (HI "") (QI "")
-- (TI "") (SF "")
-- (DF "")])
-+ (TI "") (HF "")
-+ (SF "") (DF "")])
-
- ;; Register suffix narrowed modes for VQN.
- (define_mode_attr Vmntype [(V8HI ".8b") (V4SI ".4h")
-@@ -437,10 +478,21 @@
- (V2DI "d") (V4HF "h")
- (V8HF "h") (V2SF "s")
- (V4SF "s") (V2DF "d")
-+ (HF "h")
- (SF "s") (DF "d")
- (QI "b") (HI "h")
- (SI "s") (DI "d")])
-
-+;; Vetype is used everywhere in scheduling type and assembly output,
-+;; sometimes they are not the same, for example HF modes on some
-+;; instructions. stype is defined to represent scheduling type
-+;; more accurately.
-+(define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
-+ (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
-+ (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
-+ (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
-+ (SI "s") (DI "d")])
-+
- ;; Mode-to-bitwise operation type mapping.
- (define_mode_attr Vbtype [(V8QI "8b") (V16QI "16b")
- (V4HI "8b") (V8HI "16b")
-@@ -598,7 +650,7 @@
- (V4HF "V4HI") (V8HF "V8HI")
- (V2SF "V2SI") (V4SF "V4SI")
- (V2DF "V2DI") (DF "DI")
-- (SF "SI")])
-+ (SF "SI") (HF "HI")])
-
- ;; Lower case mode of results of comparison operations.
- (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
-@@ -648,12 +700,21 @@
- (define_mode_attr atomic_sfx
- [(QI "b") (HI "h") (SI "") (DI "")])
-
--(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si") (SF "si") (DF "di")])
--(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI") (SF "SI") (DF "DI")])
-+(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
-+ (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
-+ (SF "si") (DF "di") (SI "sf") (DI "df")
-+ (V4HF "v4hi") (V8HF "v8hi") (V4HI "v4hf")
-+ (V8HI "v8hf") (HF "hi") (HI "hf")])
-+(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
-+ (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
-+ (SF "SI") (DF "DI") (SI "SF") (DI "DF")
-+ (V4HF "V4HI") (V8HF "V8HI") (V4HI "V4HF")
-+ (V8HI "V8HF") (HF "HI") (HI "HF")])
-+
-
- ;; for the inequal width integer to fp conversions
--(define_mode_attr fcvt_iesize [(SF "di") (DF "si")])
--(define_mode_attr FCVT_IESIZE [(SF "DI") (DF "SI")])
-+(define_mode_attr fcvt_iesize [(HF "di") (SF "di") (DF "si")])
-+(define_mode_attr FCVT_IESIZE [(HF "DI") (SF "DI") (DF "SI")])
-
- (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
- (V4HI "V8HI") (V8HI "V4HI")
-@@ -676,6 +737,7 @@
- ;; the 'x' constraint. All other modes may use the 'w' constraint.
- (define_mode_attr h_con [(V2SI "w") (V4SI "w")
- (V4HI "x") (V8HI "x")
-+ (V4HF "w") (V8HF "w")
- (V2SF "w") (V4SF "w")
- (V2DF "w") (DF "w")])
-
-@@ -684,6 +746,7 @@
- (V4HI "") (V8HI "")
- (V2SI "") (V4SI "")
- (DI "") (V2DI "")
-+ (V4HF "f") (V8HF "f")
- (V2SF "f") (V4SF "f")
- (V2DF "f") (DF "f")])
-
-@@ -692,6 +755,7 @@
- (V4HI "") (V8HI "")
- (V2SI "") (V4SI "")
- (DI "") (V2DI "")
-+ (V4HF "_fp") (V8HF "_fp")
- (V2SF "_fp") (V4SF "_fp")
- (V2DF "_fp") (DF "_fp")
- (SF "_fp")])
-@@ -704,17 +768,19 @@
- (V4HF "") (V8HF "_q")
- (V2SF "") (V4SF "_q")
- (V2DF "_q")
-- (QI "") (HI "") (SI "") (DI "") (SF "") (DF "")])
-+ (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
-
- (define_mode_attr vp [(V8QI "v") (V16QI "v")
- (V4HI "v") (V8HI "v")
- (V2SI "p") (V4SI "v")
-- (V2DI "p") (V2DF "p")
-- (V2SF "p") (V4SF "v")])
-+ (V2DI "p") (V2DF "p")
-+ (V2SF "p") (V4SF "v")
-+ (V4HF "v") (V8HF "v")])
-
- (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
- (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
-
-+;; Sum of lengths of instructions needed to move vector registers of a mode.
- (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
-
- ;; -fpic small model GOT reloc modifers: gotpage_lo15/lo14 for ILP64/32.
-@@ -876,9 +942,6 @@
- ;; Similar, but when not(op)
- (define_code_attr nlogical [(and "bic") (ior "orn") (xor "eon")])
-
--;; Sign- or zero-extending load
--(define_code_attr ldrxt [(sign_extend "ldrs") (zero_extend "ldr")])
--
- ;; Sign- or zero-extending data-op
- (define_code_attr su [(sign_extend "s") (zero_extend "u")
- (sign_extract "s") (zero_extract "u")
-@@ -953,9 +1016,8 @@
- (define_int_iterator ADDSUBHN2 [UNSPEC_ADDHN2 UNSPEC_RADDHN2
- UNSPEC_SUBHN2 UNSPEC_RSUBHN2])
-
--(define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN])
--
--(define_int_iterator FMAXMIN [UNSPEC_FMAXNM UNSPEC_FMINNM])
-+(define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN
-+ UNSPEC_FMAXNM UNSPEC_FMINNM])
-
- (define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH])
-
-@@ -1001,6 +1063,9 @@
- (define_int_iterator FCVT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM
- UNSPEC_FRINTA UNSPEC_FRINTN])
-
-+(define_int_iterator FCVT_F2FIXED [UNSPEC_FCVTZS UNSPEC_FCVTZU])
-+(define_int_iterator FCVT_FIXED2F [UNSPEC_SCVTF UNSPEC_UCVTF])
-+
- (define_int_iterator FRECP [UNSPEC_FRECPE UNSPEC_FRECPX])
-
- (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
-@@ -1036,7 +1101,9 @@
- (UNSPEC_FMAXV "smax_nan")
- (UNSPEC_FMIN "smin_nan")
- (UNSPEC_FMINNMV "smin")
-- (UNSPEC_FMINV "smin_nan")])
-+ (UNSPEC_FMINV "smin_nan")
-+ (UNSPEC_FMAXNM "fmax")
-+ (UNSPEC_FMINNM "fmin")])
-
- (define_int_attr maxmin_uns_op [(UNSPEC_UMAXV "umax")
- (UNSPEC_UMINV "umin")
-@@ -1047,13 +1114,9 @@
- (UNSPEC_FMAXV "fmax")
- (UNSPEC_FMIN "fmin")
- (UNSPEC_FMINNMV "fminnm")
-- (UNSPEC_FMINV "fmin")])
--
--(define_int_attr fmaxmin [(UNSPEC_FMAXNM "fmax")
-- (UNSPEC_FMINNM "fmin")])
--
--(define_int_attr fmaxmin_op [(UNSPEC_FMAXNM "fmaxnm")
-- (UNSPEC_FMINNM "fminnm")])
-+ (UNSPEC_FMINV "fmin")
-+ (UNSPEC_FMAXNM "fmaxnm")
-+ (UNSPEC_FMINNM "fminnm")])
-
- (define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u")
- (UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur")
-@@ -1137,6 +1200,11 @@
- (UNSPEC_FRINTP "ceil") (UNSPEC_FRINTM "floor")
- (UNSPEC_FRINTN "frintn")])
-
-+(define_int_attr fcvt_fixed_insn [(UNSPEC_SCVTF "scvtf")
-+ (UNSPEC_UCVTF "ucvtf")
-+ (UNSPEC_FCVTZS "fcvtzs")
-+ (UNSPEC_FCVTZU "fcvtzu")])
-+
- (define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
- (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
- (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
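The FCVT_F2FIXED/FCVT_FIXED2F iterators and the fcvt_fixed_insn attribute added above wire up the AArch64 fixed-point conversion instructions (fcvtzs/fcvtzu and scvtf/ucvtf with an immediate #fbits operand). As a rough scalar model of the operation those patterns implement — a sketch only; the real instructions also saturate on overflow, which is omitted here:

    #include <stdio.h>
    #include <stdint.h>

    /* Model of "fcvtzs w0, s0, #fbits": scale by 2^fbits, then truncate
       toward zero (hardware saturation omitted).  */
    static int32_t fcvtzs_model (float x, int fbits)
    {
      return (int32_t) (x * (float) (1u << fbits));
    }

    /* Model of the inverse "scvtf s0, w0, #fbits".  */
    static float scvtf_model (int32_t x, int fbits)
    {
      return (float) x / (float) (1u << fbits);
    }

    int main (void)
    {
      float f = 1.75f;
      int32_t q16 = fcvtzs_model (f, 16);   /* Q16 fixed point */
      printf ("%.4f -> 0x%08x -> %.4f\n", (double) f, (uint32_t) q16,
              (double) scvtf_model (q16, 16));
      return 0;
    }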
---- a/src/gcc/config/aarch64/predicates.md
-+++ b/src/gcc/config/aarch64/predicates.md
-@@ -54,9 +54,9 @@
- (match_test "op == const0_rtx"))))
-
- (define_predicate "aarch64_reg_or_fp_zero"
-- (and (match_code "reg,subreg,const_double")
-- (ior (match_operand 0 "register_operand")
-- (match_test "aarch64_float_const_zero_rtx_p (op)"))))
-+ (ior (match_operand 0 "register_operand")
-+ (and (match_code "const_double")
-+ (match_test "aarch64_float_const_zero_rtx_p (op)"))))
-
- (define_predicate "aarch64_reg_zero_or_m1_or_1"
- (and (match_code "reg,subreg,const_int")
---- a/src/gcc/config/aarch64/t-aarch64
-+++ b/src/gcc/config/aarch64/t-aarch64
-@@ -52,16 +52,17 @@ aarch-common.o: $(srcdir)/config/arm/aarch-common.c $(CONFIG_H) $(SYSTEM_H) \
- $(srcdir)/config/arm/aarch-common.c
-
- aarch64-c.o: $(srcdir)/config/aarch64/aarch64-c.c $(CONFIG_H) $(SYSTEM_H) \
-- coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H)
-+ coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) $(TARGET_H)
- $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
- $(srcdir)/config/aarch64/aarch64-c.c
-
-+PASSES_EXTRA += $(srcdir)/config/aarch64/aarch64-passes.def
-+
- cortex-a57-fma-steering.o: $(srcdir)/config/aarch64/cortex-a57-fma-steering.c \
- $(CONFIG_H) $(SYSTEM_H) $(TM_H) $(REGS_H) insn-config.h $(RTL_BASE_H) \
- dominance.h cfg.h cfganal.h $(BASIC_BLOCK_H) $(INSN_ATTR_H) $(RECOG_H) \
- output.h hash-map.h $(DF_H) $(OBSTACK_H) $(TARGET_H) $(RTL_H) \
- $(CONTEXT_H) $(TREE_PASS_H) regrename.h \
-- $(srcdir)/config/aarch64/cortex-a57-fma-steering.h \
- $(srcdir)/config/aarch64/aarch64-protos.h
- $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
- $(srcdir)/config/aarch64/cortex-a57-fma-steering.c
---- a/src/gcc/config/aarch64/thunderx.md
-+++ b/src/gcc/config/aarch64/thunderx.md
-@@ -39,7 +39,7 @@
-
- (define_insn_reservation "thunderx_shift" 1
- (and (eq_attr "tune" "thunderx")
-- (eq_attr "type" "bfm,extend,rotate_imm,shift_imm,shift_reg,rbit,rev"))
-+ (eq_attr "type" "bfm,bfx,extend,rotate_imm,shift_imm,shift_reg,rbit,rev"))
- "thunderx_pipe0 | thunderx_pipe1")
-
-
---- a/src/gcc/config/alpha/alpha.c
-+++ b/src/gcc/config/alpha/alpha.c
-@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "df.h"
- #include "tm_p.h"
---- a/src/gcc/config/arm/aarch-cost-tables.h
-+++ b/src/gcc/config/arm/aarch-cost-tables.h
-@@ -191,35 +191,35 @@ const struct cpu_cost_table cortexa53_extra_costs =
- {
- /* FP SFmode */
- {
-- COSTS_N_INSNS (15), /* div. */
-- COSTS_N_INSNS (3), /* mult. */
-- COSTS_N_INSNS (7), /* mult_addsub. */
-- COSTS_N_INSNS (7), /* fma. */
-- COSTS_N_INSNS (3), /* addsub. */
-- COSTS_N_INSNS (1), /* fpconst. */
-- COSTS_N_INSNS (2), /* neg. */
-- COSTS_N_INSNS (1), /* compare. */
-- COSTS_N_INSNS (3), /* widen. */
-- COSTS_N_INSNS (3), /* narrow. */
-- COSTS_N_INSNS (3), /* toint. */
-- COSTS_N_INSNS (3), /* fromint. */
-- COSTS_N_INSNS (3) /* roundint. */
-+ COSTS_N_INSNS (5), /* div. */
-+ COSTS_N_INSNS (1), /* mult. */
-+ COSTS_N_INSNS (2), /* mult_addsub. */
-+ COSTS_N_INSNS (2), /* fma. */
-+ COSTS_N_INSNS (1), /* addsub. */
-+ 0, /* fpconst. */
-+ COSTS_N_INSNS (1), /* neg. */
-+ 0, /* compare. */
-+ COSTS_N_INSNS (1), /* widen. */
-+ COSTS_N_INSNS (1), /* narrow. */
-+ COSTS_N_INSNS (1), /* toint. */
-+ COSTS_N_INSNS (1), /* fromint. */
-+ COSTS_N_INSNS (1) /* roundint. */
- },
- /* FP DFmode */
- {
-- COSTS_N_INSNS (30), /* div. */
-- COSTS_N_INSNS (3), /* mult. */
-- COSTS_N_INSNS (7), /* mult_addsub. */
-- COSTS_N_INSNS (7), /* fma. */
-- COSTS_N_INSNS (3), /* addsub. */
-- COSTS_N_INSNS (1), /* fpconst. */
-- COSTS_N_INSNS (2), /* neg. */
-- COSTS_N_INSNS (1), /* compare. */
-- COSTS_N_INSNS (3), /* widen. */
-- COSTS_N_INSNS (3), /* narrow. */
-- COSTS_N_INSNS (3), /* toint. */
-- COSTS_N_INSNS (3), /* fromint. */
-- COSTS_N_INSNS (3) /* roundint. */
-+ COSTS_N_INSNS (10), /* div. */
-+ COSTS_N_INSNS (1), /* mult. */
-+ COSTS_N_INSNS (2), /* mult_addsub. */
-+ COSTS_N_INSNS (2), /* fma. */
-+ COSTS_N_INSNS (1), /* addsub. */
-+ 0, /* fpconst. */
-+ COSTS_N_INSNS (1), /* neg. */
-+ 0, /* compare. */
-+ COSTS_N_INSNS (1), /* widen. */
-+ COSTS_N_INSNS (1), /* narrow. */
-+ COSTS_N_INSNS (1), /* toint. */
-+ COSTS_N_INSNS (1), /* fromint. */
-+ COSTS_N_INSNS (1) /* roundint. */
- }
- },
- /* Vector */
-@@ -294,35 +294,35 @@ const struct cpu_cost_table cortexa57_extra_costs =
- {
- /* FP SFmode */
- {
-- COSTS_N_INSNS (17), /* div. */
-- COSTS_N_INSNS (5), /* mult. */
-- COSTS_N_INSNS (9), /* mult_addsub. */
-- COSTS_N_INSNS (9), /* fma. */
-- COSTS_N_INSNS (4), /* addsub. */
-- COSTS_N_INSNS (2), /* fpconst. */
-- COSTS_N_INSNS (2), /* neg. */
-- COSTS_N_INSNS (2), /* compare. */
-- COSTS_N_INSNS (4), /* widen. */
-- COSTS_N_INSNS (4), /* narrow. */
-- COSTS_N_INSNS (4), /* toint. */
-- COSTS_N_INSNS (4), /* fromint. */
-- COSTS_N_INSNS (4) /* roundint. */
-+ COSTS_N_INSNS (6), /* div. */
-+ COSTS_N_INSNS (1), /* mult. */
-+ COSTS_N_INSNS (2), /* mult_addsub. */
-+ COSTS_N_INSNS (2), /* fma. */
-+ COSTS_N_INSNS (1), /* addsub. */
-+ 0, /* fpconst. */
-+ 0, /* neg. */
-+ 0, /* compare. */
-+ COSTS_N_INSNS (1), /* widen. */
-+ COSTS_N_INSNS (1), /* narrow. */
-+ COSTS_N_INSNS (1), /* toint. */
-+ COSTS_N_INSNS (1), /* fromint. */
-+ COSTS_N_INSNS (1) /* roundint. */
- },
- /* FP DFmode */
- {
-- COSTS_N_INSNS (31), /* div. */
-- COSTS_N_INSNS (5), /* mult. */
-- COSTS_N_INSNS (9), /* mult_addsub. */
-- COSTS_N_INSNS (9), /* fma. */
-- COSTS_N_INSNS (4), /* addsub. */
-- COSTS_N_INSNS (2), /* fpconst. */
-- COSTS_N_INSNS (2), /* neg. */
-- COSTS_N_INSNS (2), /* compare. */
-- COSTS_N_INSNS (4), /* widen. */
-- COSTS_N_INSNS (4), /* narrow. */
-- COSTS_N_INSNS (4), /* toint. */
-- COSTS_N_INSNS (4), /* fromint. */
-- COSTS_N_INSNS (4) /* roundint. */
-+ COSTS_N_INSNS (11), /* div. */
-+ COSTS_N_INSNS (1), /* mult. */
-+ COSTS_N_INSNS (2), /* mult_addsub. */
-+ COSTS_N_INSNS (2), /* fma. */
-+ COSTS_N_INSNS (1), /* addsub. */
-+ 0, /* fpconst. */
-+ 0, /* neg. */
-+ 0, /* compare. */
-+ COSTS_N_INSNS (1), /* widen. */
-+ COSTS_N_INSNS (1), /* narrow. */
-+ COSTS_N_INSNS (1), /* toint. */
-+ COSTS_N_INSNS (1), /* fromint. */
-+ COSTS_N_INSNS (1) /* roundint. */
- }
- },
- /* Vector */
-@@ -537,4 +537,107 @@ const struct cpu_cost_table xgene1_extra_costs =
- }
- };
-
-+const struct cpu_cost_table qdf24xx_extra_costs =
-+{
-+ /* ALU */
-+ {
-+ 0, /* arith. */
-+ 0, /* logical. */
-+ 0, /* shift. */
-+ 0, /* shift_reg. */
-+ COSTS_N_INSNS (1), /* arith_shift. */
-+ COSTS_N_INSNS (1), /* arith_shift_reg. */
-+ 0, /* log_shift. */
-+ 0, /* log_shift_reg. */
-+ 0, /* extend. */
-+ 0, /* extend_arith. */
-+ 0, /* bfi. */
-+ 0, /* bfx. */
-+ 0, /* clz. */
-+ 0, /* rev. */
-+ 0, /* non_exec. */
-+ true /* non_exec_costs_exec. */
-+ },
-+ {
-+ /* MULT SImode */
-+ {
-+ COSTS_N_INSNS (2), /* simple. */
-+ COSTS_N_INSNS (2), /* flag_setting. */
-+ COSTS_N_INSNS (2), /* extend. */
-+ COSTS_N_INSNS (2), /* add. */
-+ COSTS_N_INSNS (2), /* extend_add. */
-+ COSTS_N_INSNS (4) /* idiv. */
-+ },
-+ /* MULT DImode */
-+ {
-+ COSTS_N_INSNS (3), /* simple. */
-+ 0, /* flag_setting (N/A). */
-+ COSTS_N_INSNS (3), /* extend. */
-+ COSTS_N_INSNS (3), /* add. */
-+ COSTS_N_INSNS (3), /* extend_add. */
-+ COSTS_N_INSNS (9) /* idiv. */
-+ }
-+ },
-+ /* LD/ST */
-+ {
-+ COSTS_N_INSNS (2), /* load. */
-+ COSTS_N_INSNS (2), /* load_sign_extend. */
-+ COSTS_N_INSNS (2), /* ldrd. */
-+ COSTS_N_INSNS (2), /* ldm_1st. */
-+ 1, /* ldm_regs_per_insn_1st. */
-+ 2, /* ldm_regs_per_insn_subsequent. */
-+ COSTS_N_INSNS (2), /* loadf. */
-+ COSTS_N_INSNS (2), /* loadd. */
-+ COSTS_N_INSNS (3), /* load_unaligned. */
-+ 0, /* store. */
-+ 0, /* strd. */
-+ 0, /* stm_1st. */
-+ 1, /* stm_regs_per_insn_1st. */
-+ 2, /* stm_regs_per_insn_subsequent. */
-+ 0, /* storef. */
-+ 0, /* stored. */
-+ COSTS_N_INSNS (1), /* store_unaligned. */
-+ COSTS_N_INSNS (1), /* loadv. */
-+ COSTS_N_INSNS (1) /* storev. */
-+ },
-+ {
-+ /* FP SFmode */
-+ {
-+ COSTS_N_INSNS (6), /* div. */
-+ COSTS_N_INSNS (5), /* mult. */
-+ COSTS_N_INSNS (5), /* mult_addsub. */
-+ COSTS_N_INSNS (5), /* fma. */
-+ COSTS_N_INSNS (3), /* addsub. */
-+ COSTS_N_INSNS (1), /* fpconst. */
-+ COSTS_N_INSNS (1), /* neg. */
-+ COSTS_N_INSNS (2), /* compare. */
-+ COSTS_N_INSNS (4), /* widen. */
-+ COSTS_N_INSNS (4), /* narrow. */
-+ COSTS_N_INSNS (4), /* toint. */
-+ COSTS_N_INSNS (4), /* fromint. */
-+ COSTS_N_INSNS (2) /* roundint. */
-+ },
-+ /* FP DFmode */
-+ {
-+ COSTS_N_INSNS (11), /* div. */
-+ COSTS_N_INSNS (6), /* mult. */
-+ COSTS_N_INSNS (6), /* mult_addsub. */
-+ COSTS_N_INSNS (6), /* fma. */
-+ COSTS_N_INSNS (3), /* addsub. */
-+ COSTS_N_INSNS (1), /* fpconst. */
-+ COSTS_N_INSNS (1), /* neg. */
-+ COSTS_N_INSNS (2), /* compare. */
-+ COSTS_N_INSNS (4), /* widen. */
-+ COSTS_N_INSNS (4), /* narrow. */
-+ COSTS_N_INSNS (4), /* toint. */
-+ COSTS_N_INSNS (4), /* fromint. */
-+ COSTS_N_INSNS (2) /* roundint. */
-+ }
-+ },
-+ /* Vector */
-+ {
-+ COSTS_N_INSNS (1) /* alu. */
-+ }
-+};
-+
- #endif /* GCC_AARCH_COST_TABLES_H */
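Every entry in these tables goes through COSTS_N_INSNS, which in GCC's rtl.h scales an instruction count into the internal units used by the rtx cost hooks (4 units per instruction). A self-contained sketch of how such an entry is produced and read back, using a cut-down two-field stand-in for the FP sub-table rather than the real cpu_cost_table:

    #include <stdio.h>

    /* As defined in GCC's rtl.h.  */
    #define COSTS_N_INSNS(N) ((N) * 4)

    /* Hypothetical two-field excerpt of the FP-mode cost sub-table.  */
    struct fp_costs { int div; int mult; };

    static const struct fp_costs qdf24xx_sf =
      { COSTS_N_INSNS (6), COSTS_N_INSNS (5) };   /* SFmode row above */
    static const struct fp_costs qdf24xx_df =
      { COSTS_N_INSNS (11), COSTS_N_INSNS (6) };  /* DFmode row above */

    int main (void)
    {
      /* COSTS_N_INSNS (6) means "as expensive as six simple insns".  */
      printf ("SF div: %d units (~%d insns)\n",
              qdf24xx_sf.div, qdf24xx_sf.div / 4);
      printf ("DF div: %d units (~%d insns)\n",
              qdf24xx_df.div, qdf24xx_df.div / 4);
      return 0;
    }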
---- a/src/gcc/config/arm/arm-arches.def
-+++ b/src/gcc/config/arm/arm-arches.def
-@@ -58,10 +58,22 @@ ARM_ARCH("armv7e-m", cortexm4, 7EM, ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_F
- ARM_ARCH("armv8-a", cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_FOR_ARCH8A))
- ARM_ARCH("armv8-a+crc",cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A))
- ARM_ARCH("armv8.1-a", cortexa53, 8A,
-- ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8A, FL2_FOR_ARCH8_1A))
-+ ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A,
-+ FL2_FOR_ARCH8_1A))
- ARM_ARCH("armv8.1-a+crc",cortexa53, 8A,
- ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A,
- FL2_FOR_ARCH8_1A))
-+ARM_ARCH ("armv8.2-a", cortexa53, 8A,
-+ ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A,
-+ FL2_FOR_ARCH8_2A))
-+ARM_ARCH ("armv8.2-a+fp16", cortexa53, 8A,
-+ ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A,
-+ FL2_FOR_ARCH8_2A | FL2_FP16INST))
-+ARM_ARCH("armv8-m.base", cortexm23, 8M_BASE,
-+ ARM_FSET_MAKE (FL_FOR_ARCH8M_BASE, FL2_CMSE))
-+ARM_ARCH("armv8-m.main", cortexm7, 8M_MAIN,
-+ ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8M_MAIN, FL2_CMSE))
-+ARM_ARCH("armv8-m.main+dsp", cortexm33, 8M_MAIN,
-+ ARM_FSET_MAKE (FL_CO_PROC | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN, FL2_CMSE))
- ARM_ARCH("iwmmxt", iwmmxt, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT))
- ARM_ARCH("iwmmxt2", iwmmxt2, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT | FL_IWMMXT2))
--
---- a/src/gcc/config/arm/arm-builtins.c
-+++ b/src/gcc/config/arm/arm-builtins.c
-@@ -190,6 +190,8 @@ arm_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
- #define ti_UP TImode
- #define ei_UP EImode
- #define oi_UP OImode
-+#define hf_UP HFmode
-+#define si_UP SImode
-
- #define UP(X) X##_UP
-
-@@ -239,12 +241,22 @@ typedef struct {
- VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
- VAR1 (T, N, L)
-
--/* The NEON builtin data can be found in arm_neon_builtins.def.
-- The mode entries in the following table correspond to the "key" type of the
-- instruction variant, i.e. equivalent to that which would be specified after
-- the assembler mnemonic, which usually refers to the last vector operand.
-- The modes listed per instruction should be the same as those defined for
-- that instruction's pattern in neon.md. */
-+/* The NEON builtin data can be found in arm_neon_builtins.def and
-+ arm_vfp_builtins.def. The entries in arm_neon_builtins.def require
-+ TARGET_NEON to be true. The feature tests are checked when the
-+ builtins are expanded.
-+
-+ The mode entries in the following table correspond to the "key"
-+ type of the instruction variant, i.e. equivalent to that which
-+ would be specified after the assembler mnemonic, which usually
-+ refers to the last vector operand. The modes listed per
-+ instruction should be the same as those defined for that
-+ instruction's pattern in neon.md. */
-+
-+static neon_builtin_datum vfp_builtin_data[] =
-+{
-+#include "arm_vfp_builtins.def"
-+};
-
- static neon_builtin_datum neon_builtin_data[] =
- {
-@@ -515,6 +527,8 @@ enum arm_builtins
- ARM_BUILTIN_GET_FPSCR,
- ARM_BUILTIN_SET_FPSCR,
-
-+ ARM_BUILTIN_CMSE_NONSECURE_CALLER,
-+
- #undef CRYPTO1
- #undef CRYPTO2
- #undef CRYPTO3
-@@ -534,6 +548,10 @@ enum arm_builtins
- #undef CRYPTO2
- #undef CRYPTO3
-
-+ ARM_BUILTIN_VFP_BASE,
-+
-+#include "arm_vfp_builtins.def"
-+
- ARM_BUILTIN_NEON_BASE,
- ARM_BUILTIN_NEON_LANE_CHECK = ARM_BUILTIN_NEON_BASE,
-
-@@ -542,8 +560,11 @@ enum arm_builtins
- ARM_BUILTIN_MAX
- };
-
-+#define ARM_BUILTIN_VFP_PATTERN_START \
-+ (ARM_BUILTIN_VFP_BASE + 1)
-+
- #define ARM_BUILTIN_NEON_PATTERN_START \
-- (ARM_BUILTIN_MAX - ARRAY_SIZE (neon_builtin_data))
-+ (ARM_BUILTIN_NEON_BASE + 1)
-
- #undef CF
- #undef VAR1
-@@ -895,6 +916,110 @@ arm_init_simd_builtin_scalar_types (void)
- "__builtin_neon_uti");
- }
-
-+/* Set up a NEON builtin. */
-+
-+static void
-+arm_init_neon_builtin (unsigned int fcode,
-+ neon_builtin_datum *d)
-+{
-+ bool print_type_signature_p = false;
-+ char type_signature[SIMD_MAX_BUILTIN_ARGS] = { 0 };
-+ char namebuf[60];
-+ tree ftype = NULL;
-+ tree fndecl = NULL;
-+
-+ d->fcode = fcode;
-+
-+ /* We must track two variables here. op_num is
-+ the operand number as in the RTL pattern. This is
-+ required to access the mode (e.g. V4SF mode) of the
-+ argument, from which the base type can be derived.
-+     arg_num is an index into the qualifiers data, which
-+ gives qualifiers to the type (e.g. const unsigned).
-+ The reason these two variables may differ by one is the
-+ void return type. While all return types take the 0th entry
-+ in the qualifiers array, there is no operand for them in the
-+ RTL pattern. */
-+ int op_num = insn_data[d->code].n_operands - 1;
-+ int arg_num = d->qualifiers[0] & qualifier_void
-+ ? op_num + 1
-+ : op_num;
-+ tree return_type = void_type_node, args = void_list_node;
-+ tree eltype;
-+
-+ /* Build a function type directly from the insn_data for this
-+ builtin. The build_function_type () function takes care of
-+ removing duplicates for us. */
-+ for (; op_num >= 0; arg_num--, op_num--)
-+ {
-+ machine_mode op_mode = insn_data[d->code].operand[op_num].mode;
-+ enum arm_type_qualifiers qualifiers = d->qualifiers[arg_num];
-+
-+ if (qualifiers & qualifier_unsigned)
-+ {
-+ type_signature[arg_num] = 'u';
-+ print_type_signature_p = true;
-+ }
-+ else if (qualifiers & qualifier_poly)
-+ {
-+ type_signature[arg_num] = 'p';
-+ print_type_signature_p = true;
-+ }
-+ else
-+ type_signature[arg_num] = 's';
-+
-+ /* Skip an internal operand for vget_{low, high}. */
-+ if (qualifiers & qualifier_internal)
-+ continue;
-+
-+ /* Some builtins have different user-facing types
-+ for certain arguments, encoded in d->mode. */
-+ if (qualifiers & qualifier_map_mode)
-+ op_mode = d->mode;
-+
-+ /* For pointers, we want a pointer to the basic type
-+ of the vector. */
-+ if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
-+ op_mode = GET_MODE_INNER (op_mode);
-+
-+ eltype = arm_simd_builtin_type
-+ (op_mode,
-+ (qualifiers & qualifier_unsigned) != 0,
-+ (qualifiers & qualifier_poly) != 0);
-+ gcc_assert (eltype != NULL);
-+
-+ /* Add qualifiers. */
-+ if (qualifiers & qualifier_const)
-+ eltype = build_qualified_type (eltype, TYPE_QUAL_CONST);
-+
-+ if (qualifiers & qualifier_pointer)
-+ eltype = build_pointer_type (eltype);
-+
-+ /* If we have reached arg_num == 0, we are at a non-void
-+ return type. Otherwise, we are still processing
-+ arguments. */
-+ if (arg_num == 0)
-+ return_type = eltype;
-+ else
-+ args = tree_cons (NULL_TREE, eltype, args);
-+ }
-+
-+ ftype = build_function_type (return_type, args);
-+
-+ gcc_assert (ftype != NULL);
-+
-+ if (print_type_signature_p)
-+ snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s_%s",
-+ d->name, type_signature);
-+ else
-+ snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s",
-+ d->name);
-+
-+ fndecl = add_builtin_function (namebuf, ftype, fcode, BUILT_IN_MD,
-+ NULL, NULL_TREE);
-+ arm_builtin_decls[fcode] = fndecl;
-+}
-+
- /* Set up all the NEON builtins, even builtins for instructions that are not
- in the current target ISA to allow the user to compile particular modules
- with different target specific options that differ from the command line
-@@ -924,103 +1049,22 @@ arm_init_neon_builtins (void)
-
- for (i = 0; i < ARRAY_SIZE (neon_builtin_data); i++, fcode++)
- {
-- bool print_type_signature_p = false;
-- char type_signature[SIMD_MAX_BUILTIN_ARGS] = { 0 };
- neon_builtin_datum *d = &neon_builtin_data[i];
-- char namebuf[60];
-- tree ftype = NULL;
-- tree fndecl = NULL;
--
-- d->fcode = fcode;
--
-- /* We must track two variables here. op_num is
-- the operand number as in the RTL pattern. This is
-- required to access the mode (e.g. V4SF mode) of the
-- argument, from which the base type can be derived.
-- arg_num is an index in to the qualifiers data, which
-- gives qualifiers to the type (e.g. const unsigned).
-- The reason these two variables may differ by one is the
-- void return type. While all return types take the 0th entry
-- in the qualifiers array, there is no operand for them in the
-- RTL pattern. */
-- int op_num = insn_data[d->code].n_operands - 1;
-- int arg_num = d->qualifiers[0] & qualifier_void
-- ? op_num + 1
-- : op_num;
-- tree return_type = void_type_node, args = void_list_node;
-- tree eltype;
--
-- /* Build a function type directly from the insn_data for this
-- builtin. The build_function_type () function takes care of
-- removing duplicates for us. */
-- for (; op_num >= 0; arg_num--, op_num--)
-- {
-- machine_mode op_mode = insn_data[d->code].operand[op_num].mode;
-- enum arm_type_qualifiers qualifiers = d->qualifiers[arg_num];
--
-- if (qualifiers & qualifier_unsigned)
-- {
-- type_signature[arg_num] = 'u';
-- print_type_signature_p = true;
-- }
-- else if (qualifiers & qualifier_poly)
-- {
-- type_signature[arg_num] = 'p';
-- print_type_signature_p = true;
-- }
-- else
-- type_signature[arg_num] = 's';
--
-- /* Skip an internal operand for vget_{low, high}. */
-- if (qualifiers & qualifier_internal)
-- continue;
--
-- /* Some builtins have different user-facing types
-- for certain arguments, encoded in d->mode. */
-- if (qualifiers & qualifier_map_mode)
-- op_mode = d->mode;
--
-- /* For pointers, we want a pointer to the basic type
-- of the vector. */
-- if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
-- op_mode = GET_MODE_INNER (op_mode);
--
-- eltype = arm_simd_builtin_type
-- (op_mode,
-- (qualifiers & qualifier_unsigned) != 0,
-- (qualifiers & qualifier_poly) != 0);
-- gcc_assert (eltype != NULL);
--
-- /* Add qualifiers. */
-- if (qualifiers & qualifier_const)
-- eltype = build_qualified_type (eltype, TYPE_QUAL_CONST);
--
-- if (qualifiers & qualifier_pointer)
-- eltype = build_pointer_type (eltype);
--
-- /* If we have reached arg_num == 0, we are at a non-void
-- return type. Otherwise, we are still processing
-- arguments. */
-- if (arg_num == 0)
-- return_type = eltype;
-- else
-- args = tree_cons (NULL_TREE, eltype, args);
-- }
--
-- ftype = build_function_type (return_type, args);
-+ arm_init_neon_builtin (fcode, d);
-+ }
-+}
-
-- gcc_assert (ftype != NULL);
-+/* Set up all the scalar floating point builtins. */
-
-- if (print_type_signature_p)
-- snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s_%s",
-- d->name, type_signature);
-- else
-- snprintf (namebuf, sizeof (namebuf), "__builtin_neon_%s",
-- d->name);
-+static void
-+arm_init_vfp_builtins (void)
-+{
-+ unsigned int i, fcode = ARM_BUILTIN_VFP_PATTERN_START;
-
-- fndecl = add_builtin_function (namebuf, ftype, fcode, BUILT_IN_MD,
-- NULL, NULL_TREE);
-- arm_builtin_decls[fcode] = fndecl;
-+ for (i = 0; i < ARRAY_SIZE (vfp_builtin_data); i++, fcode++)
-+ {
-+ neon_builtin_datum *d = &vfp_builtin_data[i];
-+ arm_init_neon_builtin (fcode, d);
- }
- }
-
-@@ -1768,14 +1812,14 @@ arm_init_builtins (void)
- if (TARGET_HARD_FLOAT)
- {
- arm_init_neon_builtins ();
--
-+ arm_init_vfp_builtins ();
- arm_init_crypto_builtins ();
- }
-
- if (TARGET_CRC32)
- arm_init_crc32_builtins ();
-
-- if (TARGET_VFP && TARGET_HARD_FLOAT)
-+ if (TARGET_HARD_FLOAT)
- {
- tree ftype_set_fpscr
- = build_function_type_list (void_type_node, unsigned_type_node, NULL);
-@@ -1789,6 +1833,17 @@ arm_init_builtins (void)
- = add_builtin_function ("__builtin_arm_stfscr", ftype_set_fpscr,
- ARM_BUILTIN_SET_FPSCR, BUILT_IN_MD, NULL, NULL_TREE);
- }
-+
-+ if (use_cmse)
-+ {
-+ tree ftype_cmse_nonsecure_caller
-+ = build_function_type_list (unsigned_type_node, NULL);
-+ arm_builtin_decls[ARM_BUILTIN_CMSE_NONSECURE_CALLER]
-+ = add_builtin_function ("__builtin_arm_cmse_nonsecure_caller",
-+ ftype_cmse_nonsecure_caller,
-+ ARM_BUILTIN_CMSE_NONSECURE_CALLER, BUILT_IN_MD,
-+ NULL, NULL_TREE);
-+ }
- }
-
- /* Return the ARM builtin for CODE. */
-@@ -2211,40 +2266,16 @@ constant_arg:
- return target;
- }
-
--/* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds.
-- Most of these are "special" because they don't have symbolic
-- constants defined per-instruction or per instruction-variant. Instead, the
-- required info is looked up in the table neon_builtin_data. */
-+/* Expand a neon builtin. This is also used for vfp builtins, which behave in
-+ the same way. These builtins are "special" because they don't have symbolic
-+ constants defined per-instruction or per instruction-variant. Instead, the
-+ required info is looked up in the NEON_BUILTIN_DATA record that is passed
-+ into the function. */
-+
- static rtx
--arm_expand_neon_builtin (int fcode, tree exp, rtx target)
-+arm_expand_neon_builtin_1 (int fcode, tree exp, rtx target,
-+ neon_builtin_datum *d)
- {
-- /* Check in the context of the function making the call whether the
-- builtin is supported. */
-- if (! TARGET_NEON)
-- {
-- fatal_error (input_location,
-- "You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use these intrinsics.");
-- return const0_rtx;
-- }
--
-- if (fcode == ARM_BUILTIN_NEON_LANE_CHECK)
-- {
-- /* Builtin is only to check bounds of the lane passed to some intrinsics
-- that are implemented with gcc vector extensions in arm_neon.h. */
--
-- tree nlanes = CALL_EXPR_ARG (exp, 0);
-- gcc_assert (TREE_CODE (nlanes) == INTEGER_CST);
-- rtx lane_idx = expand_normal (CALL_EXPR_ARG (exp, 1));
-- if (CONST_INT_P (lane_idx))
-- neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp);
-- else
-- error ("%Klane index must be a constant immediate", exp);
-- /* Don't generate any RTL. */
-- return const0_rtx;
-- }
--
-- neon_builtin_datum *d =
-- &neon_builtin_data[fcode - ARM_BUILTIN_NEON_PATTERN_START];
- enum insn_code icode = d->code;
- builtin_arg args[SIMD_MAX_BUILTIN_ARGS + 1];
- int num_args = insn_data[d->code].n_operands;
-@@ -2260,8 +2291,8 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
- /* We have four arrays of data, each indexed in a different fashion.
- qualifiers - element 0 always describes the function return type.
- operands - element 0 is either the operand for return value (if
-- the function has a non-void return type) or the operand for the
-- first argument.
-+ the function has a non-void return type) or the operand for the
-+ first argument.
- expr_args - element 0 always holds the first argument.
- args - element 0 is always used for the return type. */
- int qualifiers_k = k;
-@@ -2283,7 +2314,7 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
- bool op_const_int_p =
- (CONST_INT_P (arg)
- && (*insn_data[icode].operand[operands_k].predicate)
-- (arg, insn_data[icode].operand[operands_k].mode));
-+ (arg, insn_data[icode].operand[operands_k].mode));
- args[k] = op_const_int_p ? NEON_ARG_CONSTANT : NEON_ARG_COPY_TO_REG;
- }
- else if (d->qualifiers[qualifiers_k] & qualifier_pointer)
-@@ -2296,8 +2327,68 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
- /* The interface to arm_expand_neon_args expects a 0 if
- the function is void, and a 1 if it is not. */
- return arm_expand_neon_args
-- (target, d->mode, fcode, icode, !is_void, exp,
-- &args[1]);
-+ (target, d->mode, fcode, icode, !is_void, exp,
-+ &args[1]);
-+}
-+
-+/* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds.
-+ Most of these are "special" because they don't have symbolic
-+ constants defined per-instruction or per instruction-variant. Instead, the
-+ required info is looked up in the table neon_builtin_data. */
-+
-+static rtx
-+arm_expand_neon_builtin (int fcode, tree exp, rtx target)
-+{
-+ if (fcode >= ARM_BUILTIN_NEON_BASE && ! TARGET_NEON)
-+ {
-+ fatal_error (input_location,
-+ "You must enable NEON instructions"
-+ " (e.g. -mfloat-abi=softfp -mfpu=neon)"
-+ " to use these intrinsics.");
-+ return const0_rtx;
-+ }
-+
-+ if (fcode == ARM_BUILTIN_NEON_LANE_CHECK)
-+ {
-+ /* Builtin is only to check bounds of the lane passed to some intrinsics
-+ that are implemented with gcc vector extensions in arm_neon.h. */
-+
-+ tree nlanes = CALL_EXPR_ARG (exp, 0);
-+ gcc_assert (TREE_CODE (nlanes) == INTEGER_CST);
-+ rtx lane_idx = expand_normal (CALL_EXPR_ARG (exp, 1));
-+ if (CONST_INT_P (lane_idx))
-+ neon_lane_bounds (lane_idx, 0, TREE_INT_CST_LOW (nlanes), exp);
-+ else
-+ error ("%Klane index must be a constant immediate", exp);
-+ /* Don't generate any RTL. */
-+ return const0_rtx;
-+ }
-+
-+ neon_builtin_datum *d
-+ = &neon_builtin_data[fcode - ARM_BUILTIN_NEON_PATTERN_START];
-+
-+ return arm_expand_neon_builtin_1 (fcode, exp, target, d);
-+}
-+
-+/* Expand a VFP builtin. These builtins are treated like
-+ neon builtins except that the data is looked up in table
-+ VFP_BUILTIN_DATA. */
-+
-+static rtx
-+arm_expand_vfp_builtin (int fcode, tree exp, rtx target)
-+{
-+ if (fcode >= ARM_BUILTIN_VFP_BASE && ! TARGET_HARD_FLOAT)
-+ {
-+ fatal_error (input_location,
-+ "You must enable VFP instructions"
-+ " to use these intrinsics.");
-+ return const0_rtx;
-+ }
-+
-+ neon_builtin_datum *d
-+ = &vfp_builtin_data[fcode - ARM_BUILTIN_VFP_PATTERN_START];
-+
-+ return arm_expand_neon_builtin_1 (fcode, exp, target, d);
- }
-
- /* Expand an expression EXP that calls a built-in function,
-@@ -2337,13 +2428,18 @@ arm_expand_builtin (tree exp,
- if (fcode >= ARM_BUILTIN_NEON_BASE)
- return arm_expand_neon_builtin (fcode, exp, target);
-
-+ if (fcode >= ARM_BUILTIN_VFP_BASE)
-+ return arm_expand_vfp_builtin (fcode, exp, target);
-+
- /* Check in the context of the function making the call whether the
- builtin is supported. */
- if (fcode >= ARM_BUILTIN_CRYPTO_BASE
- && (!TARGET_CRYPTO || !TARGET_HARD_FLOAT))
- {
- fatal_error (input_location,
-- "You must enable crypto intrinsics (e.g. include -mfloat-abi=softfp -mfpu=crypto-neon...) to use these intrinsics.");
-+ "You must enable crypto instructions"
-+ " (e.g. include -mfloat-abi=softfp -mfpu=crypto-neon...)"
-+ " to use these intrinsics.");
- return const0_rtx;
- }
-
-@@ -2368,6 +2464,12 @@ arm_expand_builtin (tree exp,
- emit_insn (pat);
- return target;
-
-+ case ARM_BUILTIN_CMSE_NONSECURE_CALLER:
-+ target = gen_reg_rtx (SImode);
-+ op0 = arm_return_addr (0, NULL_RTX);
-+ emit_insn (gen_addsi3 (target, op0, const1_rtx));
-+ return target;
-+
- case ARM_BUILTIN_TEXTRMSB:
- case ARM_BUILTIN_TEXTRMUB:
- case ARM_BUILTIN_TEXTRMSH:
-@@ -2995,7 +3097,7 @@ arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
- tree new_fenv_var, reload_fenv, restore_fnenv;
- tree update_call, atomic_feraiseexcept, hold_fnclex;
-
-- if (!TARGET_VFP || !TARGET_HARD_FLOAT)
-+ if (!TARGET_HARD_FLOAT)
- return;
-
- /* Generate the equivalent of :
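The long comment in arm_init_neon_builtin above describes walking two indices in step: op_num over the RTL pattern's operands and arg_num over the qualifiers array, with arg_num running one ahead whenever the return type is void (qualifiers[0] always describes the return type, but a void return has no RTL operand). A toy, self-contained model of just that walk — QUALIFIER_VOID and the sample arrays are invented for illustration, not GCC's real encodings:

    #include <stdio.h>

    #define QUALIFIER_VOID 1   /* stand-in for GCC's qualifier_void bit */

    static void walk (const int *qualifiers, int n_operands)
    {
      int op_num = n_operands - 1;
      int arg_num = (qualifiers[0] & QUALIFIER_VOID) ? op_num + 1 : op_num;

      for (; op_num >= 0; arg_num--, op_num--)
        printf ("  RTL operand %d <-> qualifiers[%d]\n", op_num, arg_num);
      /* arg_num reaches 0 inside the loop only for a non-void return,
         which is exactly when a return type must be built.  */
    }

    int main (void)
    {
      int void_fn[3]  = { QUALIFIER_VOID, 0, 0 };  /* e.g. a store builtin */
      int value_fn[3] = { 0, 0, 0 };               /* e.g. an arithmetic one */

      puts ("void return, 2 RTL operands:");
      walk (void_fn, 2);
      puts ("non-void return, 3 RTL operands:");
      walk (value_fn, 3);
      return 0;
    }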
---- a/src/gcc/config/arm/arm-c.c
-+++ b/src/gcc/config/arm/arm-c.c
-@@ -76,6 +76,14 @@ arm_cpu_builtins (struct cpp_reader* pfile)
-
- def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT);
-
-+ if (arm_arch8 && !arm_arch_notm)
-+ {
-+ if (arm_arch_cmse && use_cmse)
-+ builtin_define_with_int_value ("__ARM_FEATURE_CMSE", 3);
-+ else
-+ builtin_define ("__ARM_FEATURE_CMSE");
-+ }
-+
- if (TARGET_ARM_FEATURE_LDREX)
- builtin_define_with_int_value ("__ARM_FEATURE_LDREX",
- TARGET_ARM_FEATURE_LDREX);
-@@ -86,6 +94,9 @@ arm_cpu_builtins (struct cpp_reader* pfile)
- ((TARGET_ARM_ARCH >= 5 && !TARGET_THUMB)
- || TARGET_ARM_ARCH_ISA_THUMB >=2));
-
-+ def_or_undef_macro (pfile, "__ARM_FEATURE_NUMERIC_MAXMIN",
-+ TARGET_ARM_ARCH >= 8 && TARGET_NEON && TARGET_FPU_ARMV8);
-+
- def_or_undef_macro (pfile, "__ARM_FEATURE_SIMD32", TARGET_INT_SIMD);
-
- builtin_define_with_int_value ("__ARM_SIZEOF_MINIMAL_ENUM",
-@@ -128,17 +139,24 @@ arm_cpu_builtins (struct cpp_reader* pfile)
- if (TARGET_SOFT_FLOAT)
- builtin_define ("__SOFTFP__");
-
-- def_or_undef_macro (pfile, "__VFP_FP__", TARGET_VFP);
-+ builtin_define ("__VFP_FP__");
-
- if (TARGET_ARM_FP)
- builtin_define_with_int_value ("__ARM_FP", TARGET_ARM_FP);
- else
- cpp_undef (pfile, "__ARM_FP");
-
-- if (arm_fp16_format == ARM_FP16_FORMAT_IEEE)
-- builtin_define ("__ARM_FP16_FORMAT_IEEE");
-- if (arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE)
-- builtin_define ("__ARM_FP16_FORMAT_ALTERNATIVE");
-+ def_or_undef_macro (pfile, "__ARM_FP16_FORMAT_IEEE",
-+ arm_fp16_format == ARM_FP16_FORMAT_IEEE);
-+ def_or_undef_macro (pfile, "__ARM_FP16_FORMAT_ALTERNATIVE",
-+ arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE);
-+ def_or_undef_macro (pfile, "__ARM_FP16_ARGS",
-+ arm_fp16_format != ARM_FP16_FORMAT_NONE);
-+
-+ def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_SCALAR_ARITHMETIC",
-+ TARGET_VFP_FP16INST);
-+ def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC",
-+ TARGET_NEON_FP16INST);
-
- def_or_undef_macro (pfile, "__ARM_FEATURE_FMA", TARGET_FMA);
- def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON);
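These builtin_define calls surface ACLE feature-test macros that user code can key on. The value-3 case for __ARM_FEATURE_CMSE reads as a two-bit mask — bit 0 for the CMSE instructions being available, bit 1 for -mcmse being in effect — an inference from the code above that matches ACLE's definition. A small consumer:

    #include <stdio.h>

    int main (void)
    {
    #if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE & 2)
      /* Built with -mcmse: secure entry functions etc. are usable.  */
      puts ("CMSE enabled");
    #elif defined (__ARM_FEATURE_CMSE)
      puts ("CMSE instructions present, but -mcmse not given");
    #else
      puts ("no CMSE support");
    #endif

    #ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
      /* Scalar __fp16 arithmetic available (e.g. armv8.2-a+fp16).  */
      puts ("scalar FP16 arithmetic");
    #endif
      return 0;
    }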
---- a/src/gcc/config/arm/arm-cores.def
-+++ b/src/gcc/config/arm/arm-cores.def
-@@ -166,15 +166,21 @@ ARM_CORE("cortex-a15.cortex-a7", cortexa15cortexa7, cortexa7, 7A, ARM_FSET_MAKE_
- ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7, 7A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_THUMB_DIV | FL_ARM_DIV | FL_FOR_ARCH7A), cortex_a12)
-
- /* V8 Architecture Processors */
-+ARM_CORE("cortex-m23", cortexm23, cortexm23, 8M_BASE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8M_BASE), v6m)
- ARM_CORE("cortex-a32", cortexa32, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a35)
-+ARM_CORE("cortex-m33", cortexm33, cortexm33, 8M_MAIN, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN), v7m)
- ARM_CORE("cortex-a35", cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a35)
- ARM_CORE("cortex-a53", cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
- ARM_CORE("cortex-a57", cortexa57, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
- ARM_CORE("cortex-a72", cortexa72, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-+ARM_CORE("cortex-a73", cortexa73, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73)
- ARM_CORE("exynos-m1", exynosm1, exynosm1, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
--ARM_CORE("qdf24xx", qdf24xx, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-+ARM_CORE("qdf24xx", qdf24xx, cortexa57, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), qdf24xx)
- ARM_CORE("xgene1", xgene1, xgene1, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A), xgene1)
-
- /* V8 big.LITTLE implementations */
- ARM_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
- ARM_CORE("cortex-a72.cortex-a53", cortexa72cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-+ARM_CORE("cortex-a73.cortex-a35", cortexa73cortexa35, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73)
-+ARM_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, 8A, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a73)
-+
---- /dev/null
-+++ b/src/gcc/config/arm/arm-flags.h
-@@ -0,0 +1,212 @@
-+/* Flags used to identify the presence of processor capabilities.
-+
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with GCC; see the file COPYING3. If not see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef GCC_ARM_FLAGS_H
-+#define GCC_ARM_FLAGS_H
-+
-+/* Flags used to identify the presence of processor capabilities. */
-+
-+/* Bit values used to identify processor capabilities. */
-+#define FL_NONE (0U) /* No flags. */
-+#define FL_ANY (0xffffffffU) /* All flags. */
-+#define FL_CO_PROC (1U << 0) /* Has external co-processor bus. */
-+#define FL_ARCH3M (1U << 1) /* Extended multiply. */
-+#define FL_MODE26 (1U << 2) /* 26-bit mode support. */
-+#define FL_MODE32 (1U << 3) /* 32-bit mode support. */
-+#define FL_ARCH4 (1U << 4) /* Architecture rel 4. */
-+#define FL_ARCH5 (1U << 5) /* Architecture rel 5. */
-+#define FL_THUMB (1U << 6) /* Thumb aware. */
-+#define FL_LDSCHED (1U << 7) /* Load scheduling necessary. */
-+#define FL_STRONG (1U << 8) /* StrongARM. */
-+#define FL_ARCH5E (1U << 9) /* DSP extensions to v5. */
-+#define FL_XSCALE (1U << 10) /* XScale. */
-+/* spare (1U << 11) */
-+#define FL_ARCH6 (1U << 12) /* Architecture rel 6. Adds
-+ media instructions. */
-+#define FL_VFPV2 (1U << 13) /* Vector Floating Point V2. */
-+#define FL_WBUF (1U << 14) /* Schedule for write buffer ops.
-+ Note: ARM6 & 7 derivatives only. */
-+#define FL_ARCH6K (1U << 15) /* Architecture rel 6 K extensions. */
-+#define FL_THUMB2 (1U << 16) /* Thumb-2. */
-+#define FL_NOTM (1U << 17) /* Instructions not present in the 'M'
-+ profile. */
-+#define FL_THUMB_DIV (1U << 18) /* Hardware divide (Thumb mode). */
-+#define FL_VFPV3 (1U << 19) /* Vector Floating Point V3. */
-+#define FL_NEON (1U << 20) /* Neon instructions. */
-+#define FL_ARCH7EM (1U << 21) /* Instructions present in the ARMv7E-M
-+ architecture. */
-+#define FL_ARCH7 (1U << 22) /* Architecture 7. */
-+#define FL_ARM_DIV (1U << 23) /* Hardware divide (ARM mode). */
-+#define FL_ARCH8 (1U << 24) /* Architecture 8. */
-+#define FL_CRC32 (1U << 25) /* ARMv8 CRC32 instructions. */
-+#define FL_SMALLMUL (1U << 26) /* Small multiply supported. */
-+#define FL_NO_VOLATILE_CE (1U << 27) /* No volatile memory in IT block. */
-+
-+#define FL_IWMMXT (1U << 29) /* XScale v2 or "Intel Wireless MMX
-+ technology". */
-+#define FL_IWMMXT2 (1U << 30) /* "Intel Wireless MMX2
-+ technology". */
-+#define FL_ARCH6KZ (1U << 31) /* ARMv6KZ architecture. */
-+
-+#define FL2_ARCH8_1 (1U << 0) /* Architecture 8.1. */
-+#define FL2_ARCH8_2 (1U << 1) /* Architecture 8.2. */
-+#define FL2_FP16INST (1U << 2) /* FP16 Instructions for ARMv8.2 and
-+ later. */
-+#define FL2_CMSE (1U << 3) /* ARMv8-M Security Extensions. */
-+
-+/* Flags that only affect tuning, not available instructions. */
-+#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \
-+ | FL_CO_PROC)
-+
-+#define FL_FOR_ARCH2 FL_NOTM
-+#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32)
-+#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M)
-+#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4)
-+#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB)
-+#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5)
-+#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB)
-+#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E)
-+#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB)
-+#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE
-+#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6)
-+#define FL_FOR_ARCH6J FL_FOR_ARCH6
-+#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K)
-+#define FL_FOR_ARCH6Z FL_FOR_ARCH6
-+#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K
-+#define FL_FOR_ARCH6KZ (FL_FOR_ARCH6K | FL_ARCH6KZ)
-+#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
-+#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
-+#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
-+#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
-+#define FL_FOR_ARCH7VE (FL_FOR_ARCH7A | FL_THUMB_DIV | FL_ARM_DIV)
-+#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_THUMB_DIV)
-+#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV)
-+#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM)
-+#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8)
-+#define FL2_FOR_ARCH8_1A FL2_ARCH8_1
-+#define FL2_FOR_ARCH8_2A (FL2_FOR_ARCH8_1A | FL2_ARCH8_2)
-+#define FL_FOR_ARCH8M_BASE (FL_FOR_ARCH6M | FL_ARCH8 | FL_THUMB_DIV)
-+#define FL_FOR_ARCH8M_MAIN (FL_FOR_ARCH7M | FL_ARCH8)
-+
-+/* There are too many feature bits to fit in a single word so the set of cpu and
-+ fpu capabilities is a structure. A feature set is created and manipulated
-+ with the ARM_FSET macros. */
-+
-+typedef struct
-+{
-+ unsigned cpu[2];
-+} arm_feature_set;
-+
-+
-+/* Initialize a feature set. */
-+
-+#define ARM_FSET_MAKE(CPU1,CPU2) { { (CPU1), (CPU2) } }
-+
-+#define ARM_FSET_MAKE_CPU1(CPU1) ARM_FSET_MAKE ((CPU1), (FL_NONE))
-+#define ARM_FSET_MAKE_CPU2(CPU2) ARM_FSET_MAKE ((FL_NONE), (CPU2))
-+
-+/* Accessors. */
-+
-+#define ARM_FSET_CPU1(S) ((S).cpu[0])
-+#define ARM_FSET_CPU2(S) ((S).cpu[1])
-+
-+/* Useful combinations. */
-+
-+#define ARM_FSET_EMPTY ARM_FSET_MAKE (FL_NONE, FL_NONE)
-+#define ARM_FSET_ANY ARM_FSET_MAKE (FL_ANY, FL_ANY)
-+
-+/* Tests for a specific CPU feature. */
-+
-+#define ARM_FSET_HAS_CPU1(A, F) \
-+ (((A).cpu[0] & ((unsigned long)(F))) == ((unsigned long)(F)))
-+#define ARM_FSET_HAS_CPU2(A, F) \
-+ (((A).cpu[1] & ((unsigned long)(F))) == ((unsigned long)(F)))
-+#define ARM_FSET_HAS_CPU(A, F1, F2) \
-+ (ARM_FSET_HAS_CPU1 ((A), (F1)) && ARM_FSET_HAS_CPU2 ((A), (F2)))
-+
-+/* Add a feature to a feature set. */
-+
-+#define ARM_FSET_ADD_CPU1(DST, F) \
-+ do { \
-+ (DST).cpu[0] |= (F); \
-+ } while (0)
-+
-+#define ARM_FSET_ADD_CPU2(DST, F) \
-+ do { \
-+ (DST).cpu[1] |= (F); \
-+ } while (0)
-+
-+/* Remove a feature from a feature set. */
-+
-+#define ARM_FSET_DEL_CPU1(DST, F) \
-+ do { \
-+ (DST).cpu[0] &= ~(F); \
-+ } while (0)
-+
-+#define ARM_FSET_DEL_CPU2(DST, F) \
-+ do { \
-+ (DST).cpu[1] &= ~(F); \
-+ } while (0)
-+
-+/* Union of feature sets. */
-+
-+#define ARM_FSET_UNION(DST,F1,F2) \
-+ do { \
-+ (DST).cpu[0] = (F1).cpu[0] | (F2).cpu[0]; \
-+ (DST).cpu[1] = (F1).cpu[1] | (F2).cpu[1]; \
-+ } while (0)
-+
-+/* Intersection of feature sets. */
-+
-+#define ARM_FSET_INTER(DST,F1,F2) \
-+ do { \
-+ (DST).cpu[0] = (F1).cpu[0] & (F2).cpu[0]; \
-+ (DST).cpu[1] = (F1).cpu[1] & (F2).cpu[1]; \
-+ } while (0)
-+
-+/* Exclusive disjunction. */
-+
-+#define ARM_FSET_XOR(DST,F1,F2) \
-+ do { \
-+ (DST).cpu[0] = (F1).cpu[0] ^ (F2).cpu[0]; \
-+ (DST).cpu[1] = (F1).cpu[1] ^ (F2).cpu[1]; \
-+ } while (0)
-+
-+/* Difference of feature sets: F1 excluding the elements of F2. */
-+
-+#define ARM_FSET_EXCLUDE(DST,F1,F2) \
-+ do { \
-+ (DST).cpu[0] = (F1).cpu[0] & ~(F2).cpu[0]; \
-+ (DST).cpu[1] = (F1).cpu[1] & ~(F2).cpu[1]; \
-+ } while (0)
-+
-+/* Test for an empty feature set. */
-+
-+#define ARM_FSET_IS_EMPTY(A) \
-+ (!((A).cpu[0]) && !((A).cpu[1]))
-+
-+/* Tests whether the cpu features of A are a subset of B. */
-+
-+#define ARM_FSET_CPU_SUBSET(A,B) \
-+ ((((A).cpu[0] & (B).cpu[0]) == (A).cpu[0]) \
-+ && (((A).cpu[1] & (B).cpu[1]) == (A).cpu[1]))
-+
-+#endif /* GCC_ARM_FLAGS_H */
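A short illustration of the feature-set interface above. The flag values and macros are re-declared inline, slightly simplified (the real header casts through unsigned long in the HAS tests), so the example compiles on its own:

    #include <stdio.h>

    typedef struct { unsigned cpu[2]; } arm_feature_set;

    #define FL_THUMB   (1U << 6)
    #define FL_ARCH8   (1U << 24)
    #define FL2_CMSE   (1U << 3)

    #define ARM_FSET_MAKE(C1, C2)     { { (C1), (C2) } }
    #define ARM_FSET_HAS_CPU1(A, F)   (((A).cpu[0] & (F)) == (F))
    #define ARM_FSET_HAS_CPU2(A, F)   (((A).cpu[1] & (F)) == (F))
    #define ARM_FSET_ADD_CPU2(DST, F) do { (DST).cpu[1] |= (F); } while (0)

    int main (void)
    {
      /* Built roughly the way arm-arches.def builds armv8-m.base.  */
      arm_feature_set fset = ARM_FSET_MAKE (FL_THUMB | FL_ARCH8, 0U);
      ARM_FSET_ADD_CPU2 (fset, FL2_CMSE);

      printf ("has Thumb: %d\n", ARM_FSET_HAS_CPU1 (fset, FL_THUMB));
      printf ("has CMSE:  %d\n", ARM_FSET_HAS_CPU2 (fset, FL2_CMSE));
      return 0;
    }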
---- a/src/gcc/config/arm/arm-fpus.def
-+++ b/src/gcc/config/arm/arm-fpus.def
-@@ -19,30 +19,31 @@
-
- /* Before using #include to read this file, define a macro:
-
-- ARM_FPU(NAME, MODEL, REV, VFP_REGS, FEATURES)
-+ ARM_FPU(NAME, REV, VFP_REGS, FEATURES)
-
- The arguments are the fields of struct arm_fpu_desc.
-
- genopt.sh assumes no whitespace up to the first "," in each entry. */
-
--ARM_FPU("vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, FPU_FL_NONE)
--ARM_FPU("vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE)
--ARM_FPU("vfpv3-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_FP16)
--ARM_FPU("vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, FPU_FL_NONE)
--ARM_FPU("vfpv3-d16-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, FPU_FL_FP16)
--ARM_FPU("vfpv3xd", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, FPU_FL_NONE)
--ARM_FPU("vfpv3xd-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, FPU_FL_FP16)
--ARM_FPU("neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NEON)
--ARM_FPU("neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
--ARM_FPU("vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, FPU_FL_FP16)
--ARM_FPU("vfpv4-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_D16, FPU_FL_FP16)
--ARM_FPU("fpv4-sp-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_SINGLE, FPU_FL_FP16)
--ARM_FPU("fpv5-sp-d16", ARM_FP_MODEL_VFP, 5, VFP_REG_SINGLE, FPU_FL_FP16)
--ARM_FPU("fpv5-d16", ARM_FP_MODEL_VFP, 5, VFP_REG_D16, FPU_FL_FP16)
--ARM_FPU("neon-vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
--ARM_FPU("fp-armv8", ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_FP16)
--ARM_FPU("neon-fp-armv8",ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
--ARM_FPU("crypto-neon-fp-armv8",
-- ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO)
-+ARM_FPU("vfp", 2, VFP_REG_D16, FPU_FL_NONE)
-+ARM_FPU("vfpv2", 2, VFP_REG_D16, FPU_FL_NONE)
-+ARM_FPU("vfpv3", 3, VFP_REG_D32, FPU_FL_NONE)
-+ARM_FPU("vfpv3-fp16", 3, VFP_REG_D32, FPU_FL_FP16)
-+ARM_FPU("vfpv3-d16", 3, VFP_REG_D16, FPU_FL_NONE)
-+ARM_FPU("vfpv3-d16-fp16", 3, VFP_REG_D16, FPU_FL_FP16)
-+ARM_FPU("vfpv3xd", 3, VFP_REG_SINGLE, FPU_FL_NONE)
-+ARM_FPU("vfpv3xd-fp16", 3, VFP_REG_SINGLE, FPU_FL_FP16)
-+ARM_FPU("neon", 3, VFP_REG_D32, FPU_FL_NEON)
-+ARM_FPU("neon-vfpv3", 3, VFP_REG_D32, FPU_FL_NEON)
-+ARM_FPU("neon-fp16", 3, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
-+ARM_FPU("vfpv4", 4, VFP_REG_D32, FPU_FL_FP16)
-+ARM_FPU("vfpv4-d16", 4, VFP_REG_D16, FPU_FL_FP16)
-+ARM_FPU("fpv4-sp-d16", 4, VFP_REG_SINGLE, FPU_FL_FP16)
-+ARM_FPU("fpv5-sp-d16", 5, VFP_REG_SINGLE, FPU_FL_FP16)
-+ARM_FPU("fpv5-d16", 5, VFP_REG_D16, FPU_FL_FP16)
-+ARM_FPU("neon-vfpv4", 4, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
-+ARM_FPU("fp-armv8", 8, VFP_REG_D32, FPU_FL_FP16)
-+ARM_FPU("neon-fp-armv8", 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
-+ARM_FPU("crypto-neon-fp-armv8", 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO)
- /* Compatibility aliases. */
--ARM_FPU("vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE)
-+ARM_FPU("vfp3", 3, VFP_REG_D32, FPU_FL_NONE)
---- a/src/gcc/config/arm/arm-modes.def
-+++ b/src/gcc/config/arm/arm-modes.def
-@@ -59,6 +59,7 @@ CC_MODE (CC_DGEU);
- CC_MODE (CC_DGTU);
- CC_MODE (CC_C);
- CC_MODE (CC_N);
-+CC_MODE (CC_V);
-
- /* Vector modes. */
- VECTOR_MODES (INT, 4); /* V4QI V2HI */
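CC_V is new here: a condition-code mode representing only the overflow flag. Presumably — inferred from the name, not stated in this hunk — it exists so signed overflow-checking patterns can test V directly, letting __builtin_add_overflow expand to an add-then-branch-on-V sequence. The portable builtin such patterns serve:

    #include <stdio.h>
    #include <limits.h>

    int main (void)
    {
      int sum;

      /* With a V-flag pattern the target can emit "adds; bvs" here
         instead of reconstructing overflow from the operand signs.  */
      if (__builtin_add_overflow (INT_MAX, 1, &sum))
        puts ("signed overflow detected");
      else
        printf ("sum = %d\n", sum);
      return 0;
    }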
---- a/src/gcc/config/arm/arm-opts.h
-+++ b/src/gcc/config/arm/arm-opts.h
-@@ -25,6 +25,8 @@
- #ifndef ARM_OPTS_H
- #define ARM_OPTS_H
-
-+#include "arm-flags.h"
-+
- /* The various ARM cores. */
- enum processor_type
- {
---- a/src/gcc/config/arm/arm-protos.h
-+++ b/src/gcc/config/arm/arm-protos.h
-@@ -22,6 +22,8 @@
- #ifndef GCC_ARM_PROTOS_H
- #define GCC_ARM_PROTOS_H
-
-+#include "arm-flags.h"
-+
- extern enum unwind_info_type arm_except_unwind_info (struct gcc_options *);
- extern int use_return_insn (int, rtx);
- extern bool use_simple_return_p (void);
-@@ -31,6 +33,7 @@ extern int arm_volatile_func (void);
- extern void arm_expand_prologue (void);
- extern void arm_expand_epilogue (bool);
- extern void arm_declare_function_name (FILE *, const char *, tree);
-+extern void arm_asm_declare_function_name (FILE *, const char *, tree);
- extern void thumb2_expand_return (bool);
- extern const char *arm_strip_name_encoding (const char *);
- extern void arm_asm_output_labelref (FILE *, const char *);
-@@ -50,8 +53,12 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
- ATTRIBUTE_UNUSED);
- extern void arm_init_builtins (void);
- extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
--
-+extern rtx arm_simd_vect_par_cnst_half (machine_mode mode, bool high);
-+extern bool arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
-+ bool high);
- #ifdef RTX_CODE
-+extern void arm_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
-+ rtx label_ref);
- extern bool arm_vector_mode_supported_p (machine_mode);
- extern bool arm_small_register_classes_for_mode_p (machine_mode);
- extern int arm_hard_regno_mode_ok (unsigned int, machine_mode);
-@@ -130,6 +137,7 @@ extern int arm_const_double_inline_cost (rtx);
- extern bool arm_const_double_by_parts (rtx);
- extern bool arm_const_double_by_immediates (rtx);
- extern void arm_emit_call_insn (rtx, rtx, bool);
-+bool detect_cmse_nonsecure_call (tree);
- extern const char *output_call (rtx *);
- void arm_emit_movpair (rtx, rtx);
- extern const char *output_mov_long_double_arm_from_arm (rtx *);
-@@ -161,6 +169,7 @@ extern const char *arm_output_iwmmxt_shift_immediate (const char *, rtx *, bool)
- extern const char *arm_output_iwmmxt_tinsr (rtx *);
- extern unsigned int arm_sync_loop_insns (rtx , rtx *);
- extern int arm_attr_length_push_multi(rtx, rtx);
-+extern int arm_attr_length_pop_multi(rtx *, bool, bool);
- extern void arm_expand_compare_and_swap (rtx op[]);
- extern void arm_split_compare_and_swap (rtx op[]);
- extern void arm_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
-@@ -192,7 +201,6 @@ extern const char *thumb_call_via_reg (rtx);
- extern void thumb_expand_movmemqi (rtx *);
- extern rtx arm_return_addr (int, rtx);
- extern void thumb_reload_out_hi (rtx *);
--extern void thumb_reload_in_hi (rtx *);
- extern void thumb_set_return_address (rtx, rtx);
- extern const char *thumb1_output_casesi (rtx *);
- extern const char *thumb2_output_casesi (rtx *);
-@@ -256,7 +264,6 @@ struct cpu_cost_table;
-
- struct tune_params
- {
-- bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool);
- const struct cpu_cost_table *insn_extra_cost;
- bool (*sched_adjust_cost) (rtx_insn *, rtx, rtx_insn *, int *);
- int (*branch_cost) (bool, bool);
-@@ -319,6 +326,7 @@ extern int vfp3_const_double_for_bits (rtx);
-
- extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
- rtx);
-+extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
- extern bool arm_valid_symbolic_address_p (rtx);
- extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
- #endif /* RTX_CODE */
-@@ -344,184 +352,6 @@ extern void arm_cpu_cpp_builtins (struct cpp_reader *);
-
- extern bool arm_is_constant_pool_ref (rtx);
-
--/* Flags used to identify the presence of processor capabilities. */
--
--/* Bit values used to identify processor capabilities. */
--#define FL_NONE (0) /* No flags. */
--#define FL_ANY (0xffffffff) /* All flags. */
--#define FL_CO_PROC (1 << 0) /* Has external co-processor bus */
--#define FL_ARCH3M (1 << 1) /* Extended multiply */
--#define FL_MODE26 (1 << 2) /* 26-bit mode support */
--#define FL_MODE32 (1 << 3) /* 32-bit mode support */
--#define FL_ARCH4 (1 << 4) /* Architecture rel 4 */
--#define FL_ARCH5 (1 << 5) /* Architecture rel 5 */
--#define FL_THUMB (1 << 6) /* Thumb aware */
--#define FL_LDSCHED (1 << 7) /* Load scheduling necessary */
--#define FL_STRONG (1 << 8) /* StrongARM */
--#define FL_ARCH5E (1 << 9) /* DSP extensions to v5 */
--#define FL_XSCALE (1 << 10) /* XScale */
--/* spare (1 << 11) */
--#define FL_ARCH6 (1 << 12) /* Architecture rel 6. Adds
-- media instructions. */
--#define FL_VFPV2 (1 << 13) /* Vector Floating Point V2. */
--#define FL_WBUF (1 << 14) /* Schedule for write buffer ops.
-- Note: ARM6 & 7 derivatives only. */
--#define FL_ARCH6K (1 << 15) /* Architecture rel 6 K extensions. */
--#define FL_THUMB2 (1 << 16) /* Thumb-2. */
--#define FL_NOTM (1 << 17) /* Instructions not present in the 'M'
-- profile. */
--#define FL_THUMB_DIV (1 << 18) /* Hardware divide (Thumb mode). */
--#define FL_VFPV3 (1 << 19) /* Vector Floating Point V3. */
--#define FL_NEON (1 << 20) /* Neon instructions. */
--#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M
-- architecture. */
--#define FL_ARCH7 (1 << 22) /* Architecture 7. */
--#define FL_ARM_DIV (1 << 23) /* Hardware divide (ARM mode). */
--#define FL_ARCH8 (1 << 24) /* Architecture 8. */
--#define FL_CRC32 (1 << 25) /* ARMv8 CRC32 instructions. */
--
--#define FL_SMALLMUL (1 << 26) /* Small multiply supported. */
--#define FL_NO_VOLATILE_CE (1 << 27) /* No volatile memory in IT block. */
--
--#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */
--#define FL_IWMMXT2 (1 << 30) /* "Intel Wireless MMX2 technology". */
--#define FL_ARCH6KZ (1 << 31) /* ARMv6KZ architecture. */
--
--#define FL2_ARCH8_1 (1 << 0) /* Architecture 8.1. */
--
--/* Flags that only effect tuning, not available instructions. */
--#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \
-- | FL_CO_PROC)
--
--#define FL_FOR_ARCH2 FL_NOTM
--#define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32)
--#define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M)
--#define FL_FOR_ARCH4 (FL_FOR_ARCH3M | FL_ARCH4)
--#define FL_FOR_ARCH4T (FL_FOR_ARCH4 | FL_THUMB)
--#define FL_FOR_ARCH5 (FL_FOR_ARCH4 | FL_ARCH5)
--#define FL_FOR_ARCH5T (FL_FOR_ARCH5 | FL_THUMB)
--#define FL_FOR_ARCH5E (FL_FOR_ARCH5 | FL_ARCH5E)
--#define FL_FOR_ARCH5TE (FL_FOR_ARCH5E | FL_THUMB)
--#define FL_FOR_ARCH5TEJ FL_FOR_ARCH5TE
--#define FL_FOR_ARCH6 (FL_FOR_ARCH5TE | FL_ARCH6)
--#define FL_FOR_ARCH6J FL_FOR_ARCH6
--#define FL_FOR_ARCH6K (FL_FOR_ARCH6 | FL_ARCH6K)
--#define FL_FOR_ARCH6Z FL_FOR_ARCH6
--#define FL_FOR_ARCH6KZ (FL_FOR_ARCH6K | FL_ARCH6KZ)
--#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
--#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
--#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
--#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
--#define FL_FOR_ARCH7VE (FL_FOR_ARCH7A | FL_THUMB_DIV | FL_ARM_DIV)
--#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_THUMB_DIV)
--#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_THUMB_DIV)
--#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM)
--#define FL_FOR_ARCH8A (FL_FOR_ARCH7VE | FL_ARCH8)
--#define FL2_FOR_ARCH8_1A FL2_ARCH8_1
--
--/* There are too many feature bits to fit in a single word so the set of cpu and
-- fpu capabilities is a structure. A feature set is created and manipulated
-- with the ARM_FSET macros. */
--
--typedef struct
--{
-- unsigned long cpu[2];
--} arm_feature_set;
--
--
--/* Initialize a feature set. */
--
--#define ARM_FSET_MAKE(CPU1,CPU2) { { (CPU1), (CPU2) } }
--
--#define ARM_FSET_MAKE_CPU1(CPU1) ARM_FSET_MAKE ((CPU1), (FL_NONE))
--#define ARM_FSET_MAKE_CPU2(CPU2) ARM_FSET_MAKE ((FL_NONE), (CPU2))
--
--/* Accessors. */
--
--#define ARM_FSET_CPU1(S) ((S).cpu[0])
--#define ARM_FSET_CPU2(S) ((S).cpu[1])
--
--/* Useful combinations. */
--
--#define ARM_FSET_EMPTY ARM_FSET_MAKE (FL_NONE, FL_NONE)
--#define ARM_FSET_ANY ARM_FSET_MAKE (FL_ANY, FL_ANY)
--
--/* Tests for a specific CPU feature. */
--
--#define ARM_FSET_HAS_CPU1(A, F) \
-- (((A).cpu[0] & ((unsigned long)(F))) == ((unsigned long)(F)))
--#define ARM_FSET_HAS_CPU2(A, F) \
-- (((A).cpu[1] & ((unsigned long)(F))) == ((unsigned long)(F)))
--#define ARM_FSET_HAS_CPU(A, F1, F2) \
-- (ARM_FSET_HAS_CPU1 ((A), (F1)) && ARM_FSET_HAS_CPU2 ((A), (F2)))
--
--/* Add a feature to a feature set. */
--
--#define ARM_FSET_ADD_CPU1(DST, F) \
-- do { \
-- (DST).cpu[0] |= (F); \
-- } while (0)
--
--#define ARM_FSET_ADD_CPU2(DST, F) \
-- do { \
-- (DST).cpu[1] |= (F); \
-- } while (0)
--
--/* Remove a feature from a feature set. */
--
--#define ARM_FSET_DEL_CPU1(DST, F) \
-- do { \
-- (DST).cpu[0] &= ~(F); \
-- } while (0)
--
--#define ARM_FSET_DEL_CPU2(DST, F) \
-- do { \
-- (DST).cpu[1] &= ~(F); \
-- } while (0)
--
--/* Union of feature sets. */
--
--#define ARM_FSET_UNION(DST,F1,F2) \
-- do { \
-- (DST).cpu[0] = (F1).cpu[0] | (F2).cpu[0]; \
-- (DST).cpu[1] = (F1).cpu[1] | (F2).cpu[1]; \
-- } while (0)
--
--/* Intersection of feature sets. */
--
--#define ARM_FSET_INTER(DST,F1,F2) \
-- do { \
-- (DST).cpu[0] = (F1).cpu[0] & (F2).cpu[0]; \
-- (DST).cpu[1] = (F1).cpu[1] & (F2).cpu[1]; \
-- } while (0)
--
--/* Exclusive disjunction. */
--
--#define ARM_FSET_XOR(DST,F1,F2) \
-- do { \
-- (DST).cpu[0] = (F1).cpu[0] ^ (F2).cpu[0]; \
-- (DST).cpu[1] = (F1).cpu[1] ^ (F2).cpu[1]; \
-- } while (0)
--
--/* Difference of feature sets: F1 excluding the elements of F2. */
--
--#define ARM_FSET_EXCLUDE(DST,F1,F2) \
-- do { \
-- (DST).cpu[0] = (F1).cpu[0] & ~(F2).cpu[0]; \
-- (DST).cpu[1] = (F1).cpu[1] & ~(F2).cpu[1]; \
-- } while (0)
--
--/* Test for an empty feature set. */
--
--#define ARM_FSET_IS_EMPTY(A) \
-- (!((A).cpu[0]) && !((A).cpu[1]))
--
--/* Tests whether the cpu features of A are a subset of B. */
--
--#define ARM_FSET_CPU_SUBSET(A,B) \
-- ((((A).cpu[0] & (B).cpu[0]) == (A).cpu[0]) \
-- && (((A).cpu[1] & (B).cpu[1]) == (A).cpu[1]))
--
- /* The bits in this mask specify which
- instructions we are allowed to generate. */
- extern arm_feature_set insn_flags;
-@@ -601,6 +431,9 @@ extern int arm_tune_cortex_a9;
- interworking clean. */
- extern int arm_cpp_interwork;
-
-+/* Nonzero if chip supports Thumb 1. */
-+extern int arm_arch_thumb1;
-+
- /* Nonzero if chip supports Thumb 2. */
- extern int arm_arch_thumb2;
-
---- a/src/gcc/config/arm/arm-tables.opt
-+++ b/src/gcc/config/arm/arm-tables.opt
-@@ -307,9 +307,15 @@ EnumValue
- Enum(processor_type) String(cortex-a17.cortex-a7) Value(cortexa17cortexa7)
-
- EnumValue
-+Enum(processor_type) String(cortex-m23) Value(cortexm23)
-+
-+EnumValue
- Enum(processor_type) String(cortex-a32) Value(cortexa32)
-
- EnumValue
-+Enum(processor_type) String(cortex-m33) Value(cortexm33)
-+
-+EnumValue
- Enum(processor_type) String(cortex-a35) Value(cortexa35)
-
- EnumValue
-@@ -322,6 +328,9 @@ EnumValue
- Enum(processor_type) String(cortex-a72) Value(cortexa72)
-
- EnumValue
-+Enum(processor_type) String(cortex-a73) Value(cortexa73)
-+
-+EnumValue
- Enum(processor_type) String(exynos-m1) Value(exynosm1)
-
- EnumValue
-@@ -336,6 +345,12 @@ Enum(processor_type) String(cortex-a57.cortex-a53) Value(cortexa57cortexa53)
- EnumValue
- Enum(processor_type) String(cortex-a72.cortex-a53) Value(cortexa72cortexa53)
-
-+EnumValue
-+Enum(processor_type) String(cortex-a73.cortex-a35) Value(cortexa73cortexa35)
-+
-+EnumValue
-+Enum(processor_type) String(cortex-a73.cortex-a53) Value(cortexa73cortexa53)
-+
- Enum
- Name(arm_arch) Type(int)
- Known ARM architectures (for use with the -march= option):
-@@ -428,10 +443,25 @@ EnumValue
- Enum(arm_arch) String(armv8.1-a+crc) Value(28)
-
- EnumValue
--Enum(arm_arch) String(iwmmxt) Value(29)
-+Enum(arm_arch) String(armv8.2-a) Value(29)
-+
-+EnumValue
-+Enum(arm_arch) String(armv8.2-a+fp16) Value(30)
-
- EnumValue
--Enum(arm_arch) String(iwmmxt2) Value(30)
-+Enum(arm_arch) String(armv8-m.base) Value(31)
-+
-+EnumValue
-+Enum(arm_arch) String(armv8-m.main) Value(32)
-+
-+EnumValue
-+Enum(arm_arch) String(armv8-m.main+dsp) Value(33)
-+
-+EnumValue
-+Enum(arm_arch) String(iwmmxt) Value(34)
-+
-+EnumValue
-+Enum(arm_arch) String(iwmmxt2) Value(35)
-
- Enum
- Name(arm_fpu) Type(int)
-@@ -441,56 +471,62 @@ EnumValue
- Enum(arm_fpu) String(vfp) Value(0)
-
- EnumValue
--Enum(arm_fpu) String(vfpv3) Value(1)
-+Enum(arm_fpu) String(vfpv2) Value(1)
-+
-+EnumValue
-+Enum(arm_fpu) String(vfpv3) Value(2)
-+
-+EnumValue
-+Enum(arm_fpu) String(vfpv3-fp16) Value(3)
-
- EnumValue
--Enum(arm_fpu) String(vfpv3-fp16) Value(2)
-+Enum(arm_fpu) String(vfpv3-d16) Value(4)
-
- EnumValue
--Enum(arm_fpu) String(vfpv3-d16) Value(3)
-+Enum(arm_fpu) String(vfpv3-d16-fp16) Value(5)
-
- EnumValue
--Enum(arm_fpu) String(vfpv3-d16-fp16) Value(4)
-+Enum(arm_fpu) String(vfpv3xd) Value(6)
-
- EnumValue
--Enum(arm_fpu) String(vfpv3xd) Value(5)
-+Enum(arm_fpu) String(vfpv3xd-fp16) Value(7)
-
- EnumValue
--Enum(arm_fpu) String(vfpv3xd-fp16) Value(6)
-+Enum(arm_fpu) String(neon) Value(8)
-
- EnumValue
--Enum(arm_fpu) String(neon) Value(7)
-+Enum(arm_fpu) String(neon-vfpv3) Value(9)
-
- EnumValue
--Enum(arm_fpu) String(neon-fp16) Value(8)
-+Enum(arm_fpu) String(neon-fp16) Value(10)
-
- EnumValue
--Enum(arm_fpu) String(vfpv4) Value(9)
-+Enum(arm_fpu) String(vfpv4) Value(11)
-
- EnumValue
--Enum(arm_fpu) String(vfpv4-d16) Value(10)
-+Enum(arm_fpu) String(vfpv4-d16) Value(12)
-
- EnumValue
--Enum(arm_fpu) String(fpv4-sp-d16) Value(11)
-+Enum(arm_fpu) String(fpv4-sp-d16) Value(13)
-
- EnumValue
--Enum(arm_fpu) String(fpv5-sp-d16) Value(12)
-+Enum(arm_fpu) String(fpv5-sp-d16) Value(14)
-
- EnumValue
--Enum(arm_fpu) String(fpv5-d16) Value(13)
-+Enum(arm_fpu) String(fpv5-d16) Value(15)
-
- EnumValue
--Enum(arm_fpu) String(neon-vfpv4) Value(14)
-+Enum(arm_fpu) String(neon-vfpv4) Value(16)
-
- EnumValue
--Enum(arm_fpu) String(fp-armv8) Value(15)
-+Enum(arm_fpu) String(fp-armv8) Value(17)
-
- EnumValue
--Enum(arm_fpu) String(neon-fp-armv8) Value(16)
-+Enum(arm_fpu) String(neon-fp-armv8) Value(18)
-
- EnumValue
--Enum(arm_fpu) String(crypto-neon-fp-armv8) Value(17)
-+Enum(arm_fpu) String(crypto-neon-fp-armv8) Value(19)
-
- EnumValue
--Enum(arm_fpu) String(vfp3) Value(18)
-+Enum(arm_fpu) String(vfp3) Value(20)
-
---- a/src/gcc/config/arm/arm-tune.md
-+++ b/src/gcc/config/arm/arm-tune.md
-@@ -32,8 +32,10 @@
- cortexr4f,cortexr5,cortexr7,
- cortexr8,cortexm7,cortexm4,
- cortexm3,marvell_pj4,cortexa15cortexa7,
-- cortexa17cortexa7,cortexa32,cortexa35,
-- cortexa53,cortexa57,cortexa72,
-+ cortexa17cortexa7,cortexm23,cortexa32,
-+ cortexm33,cortexa35,cortexa53,
-+ cortexa57,cortexa72,cortexa73,
- exynosm1,qdf24xx,xgene1,
-- cortexa57cortexa53,cortexa72cortexa53"
-+ cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,
-+ cortexa73cortexa53"
- (const (symbol_ref "((enum attr_tune) arm_tune)")))
---- a/src/gcc/config/arm/arm.c
-+++ b/src/gcc/config/arm/arm.c
-@@ -27,6 +27,7 @@
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "cfghooks.h"
- #include "df.h"
- #include "tm_p.h"
-@@ -61,6 +62,7 @@
- #include "builtins.h"
- #include "tm-constrs.h"
- #include "rtl-iter.h"
-+#include "gimplify.h"
-
- /* This file should be included last. */
- #include "target-def.h"
-@@ -104,7 +106,6 @@ static void arm_print_operand_address (FILE *, machine_mode, rtx);
- static bool arm_print_operand_punct_valid_p (unsigned char code);
- static const char *fp_const_from_val (REAL_VALUE_TYPE *);
- static arm_cc get_arm_condition_code (rtx);
--static HOST_WIDE_INT int_log2 (HOST_WIDE_INT);
- static const char *output_multi_immediate (rtx *, const char *, const char *,
- int, HOST_WIDE_INT);
- static const char *shift_op (rtx, HOST_WIDE_INT *);
-@@ -135,6 +136,8 @@ static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *);
- #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
- static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *);
- #endif
-+static tree arm_handle_cmse_nonsecure_entry (tree *, tree, tree, int, bool *);
-+static tree arm_handle_cmse_nonsecure_call (tree *, tree, tree, int, bool *);
- static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT);
- static void arm_output_function_prologue (FILE *, HOST_WIDE_INT);
- static int arm_comp_type_attributes (const_tree, const_tree);
-@@ -164,12 +167,6 @@ static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
- static bool arm_have_conditional_execution (void);
- static bool arm_cannot_force_const_mem (machine_mode, rtx);
- static bool arm_legitimate_constant_p (machine_mode, rtx);
--static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool);
--static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *);
--static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
--static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
--static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
--static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
- static bool arm_rtx_costs (rtx, machine_mode, int, int, int *, bool);
- static int arm_address_cost (rtx, machine_mode, addr_space_t, bool);
- static int arm_register_move_cost (machine_mode, reg_class_t, reg_class_t);
-@@ -249,8 +246,6 @@ static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
- static bool arm_output_addr_const_extra (FILE *, rtx);
- static bool arm_allocate_stack_slots_for_args (void);
- static bool arm_warn_func_return (tree);
--static const char *arm_invalid_parameter_type (const_tree t);
--static const char *arm_invalid_return_type (const_tree t);
- static tree arm_promoted_type (const_tree t);
- static tree arm_convert_to_type (tree type, tree expr);
- static bool arm_scalar_mode_supported_p (machine_mode);
-@@ -300,6 +295,9 @@ static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
- static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
-
- static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
-+static bool arm_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT,
-+ const_tree);
-+
-
- /* Table of machine attributes. */
- static const struct attribute_spec arm_attribute_table[] =
-@@ -343,6 +341,11 @@ static const struct attribute_spec arm_attribute_table[] =
- { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute,
- false },
- #endif
-+ /* ARMv8-M Security Extensions support. */
-+ { "cmse_nonsecure_entry", 0, 0, true, false, false,
-+ arm_handle_cmse_nonsecure_entry, false },
-+ { "cmse_nonsecure_call", 0, 0, true, false, false,
-+ arm_handle_cmse_nonsecure_call, true },
- { NULL, 0, 0, false, false, false, NULL, false }
- };
-
-@@ -463,7 +466,7 @@ static const struct attribute_spec arm_attribute_table[] =
- #undef TARGET_ASM_OUTPUT_MI_THUNK
- #define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk
- #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
--#define TARGET_ASM_CAN_OUTPUT_MI_THUNK default_can_output_mi_thunk_no_vcall
-+#define TARGET_ASM_CAN_OUTPUT_MI_THUNK arm_can_output_mi_thunk
-
- #undef TARGET_RTX_COSTS
- #define TARGET_RTX_COSTS arm_rtx_costs
-@@ -654,12 +657,6 @@ static const struct attribute_spec arm_attribute_table[] =
- #undef TARGET_PREFERRED_RELOAD_CLASS
- #define TARGET_PREFERRED_RELOAD_CLASS arm_preferred_reload_class
-
--#undef TARGET_INVALID_PARAMETER_TYPE
--#define TARGET_INVALID_PARAMETER_TYPE arm_invalid_parameter_type
--
--#undef TARGET_INVALID_RETURN_TYPE
--#define TARGET_INVALID_RETURN_TYPE arm_invalid_return_type
--
- #undef TARGET_PROMOTED_TYPE
- #define TARGET_PROMOTED_TYPE arm_promoted_type
-
-@@ -820,6 +817,13 @@ int arm_arch8 = 0;
- /* Nonzero if this chip supports the ARMv8.1 extensions. */
- int arm_arch8_1 = 0;
-
-+/* Nonzero if this chip supports the ARM Architecture 8.2 extensions. */
-+int arm_arch8_2 = 0;
-+
-+/* Nonzero if this chip supports the FP16 instructions extension of ARM
-+ Architecture 8.2. */
-+int arm_fp16_inst = 0;
-+
- /* Nonzero if this chip can benefit from load scheduling. */
- int arm_ld_sched = 0;
-
-@@ -852,6 +856,9 @@ int arm_tune_cortex_a9 = 0;
- interworking clean. */
- int arm_cpp_interwork = 0;
-
-+/* Nonzero if chip supports Thumb 1. */
-+int arm_arch_thumb1;
-+
- /* Nonzero if chip supports Thumb 2. */
- int arm_arch_thumb2;
-
-@@ -892,6 +899,9 @@ int arm_condexec_masklen = 0;
- /* Nonzero if chip supports the ARMv8 CRC instructions. */
- int arm_arch_crc = 0;
-
-+/* Nonzero if chip supports the ARMv8-M security extensions. */
-+int arm_arch_cmse = 0;
-+
- /* Nonzero if the core has a very small, high-latency, multiply unit. */
- int arm_m_profile_small_mul = 0;
-
-@@ -1684,8 +1694,7 @@ const struct cpu_cost_table v7m_extra_costs =
-
- const struct tune_params arm_slowmul_tune =
- {
-- arm_slowmul_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1707,8 +1716,7 @@ const struct tune_params arm_slowmul_tune =
-
- const struct tune_params arm_fastmul_tune =
- {
-- arm_fastmul_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1733,8 +1741,7 @@ const struct tune_params arm_fastmul_tune =
-
- const struct tune_params arm_strongarm_tune =
- {
-- arm_fastmul_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1756,8 +1763,7 @@ const struct tune_params arm_strongarm_tune =
-
- const struct tune_params arm_xscale_tune =
- {
-- arm_xscale_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- xscale_sched_adjust_cost,
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1779,8 +1785,7 @@ const struct tune_params arm_xscale_tune =
-
- const struct tune_params arm_9e_tune =
- {
-- arm_9e_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1802,8 +1807,7 @@ const struct tune_params arm_9e_tune =
-
- const struct tune_params arm_marvell_pj4_tune =
- {
-- arm_9e_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1825,8 +1829,7 @@ const struct tune_params arm_marvell_pj4_tune =
-
- const struct tune_params arm_v6t2_tune =
- {
-- arm_9e_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -1850,7 +1853,6 @@ const struct tune_params arm_v6t2_tune =
- /* Generic Cortex tuning. Use more specific tunings if appropriate. */
- const struct tune_params arm_cortex_tune =
- {
-- arm_9e_rtx_costs,
- &generic_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -1873,7 +1875,6 @@ const struct tune_params arm_cortex_tune =
-
- const struct tune_params arm_cortex_a8_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa8_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -1896,7 +1897,6 @@ const struct tune_params arm_cortex_a8_tune =
-
- const struct tune_params arm_cortex_a7_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa7_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -1919,7 +1919,6 @@ const struct tune_params arm_cortex_a7_tune =
-
- const struct tune_params arm_cortex_a15_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa15_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -1942,7 +1941,6 @@ const struct tune_params arm_cortex_a15_tune =
-
- const struct tune_params arm_cortex_a35_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa53_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -1965,7 +1963,6 @@ const struct tune_params arm_cortex_a35_tune =
-
- const struct tune_params arm_cortex_a53_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa53_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -1988,7 +1985,6 @@ const struct tune_params arm_cortex_a53_tune =
-
- const struct tune_params arm_cortex_a57_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa57_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -2011,7 +2007,6 @@ const struct tune_params arm_cortex_a57_tune =
-
- const struct tune_params arm_exynosm1_tune =
- {
-- arm_9e_rtx_costs,
- &exynosm1_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -2034,7 +2029,6 @@ const struct tune_params arm_exynosm1_tune =
-
- const struct tune_params arm_xgene1_tune =
- {
-- arm_9e_rtx_costs,
- &xgene1_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -2055,12 +2049,33 @@ const struct tune_params arm_xgene1_tune =
- tune_params::SCHED_AUTOPREF_OFF
- };
-
-+const struct tune_params arm_qdf24xx_tune =
-+{
-+ &qdf24xx_extra_costs,
-+ NULL, /* Scheduler cost adjustment. */
-+ arm_default_branch_cost,
-+ &arm_default_vec_cost, /* Vectorizer costs. */
-+ 1, /* Constant limit. */
-+ 2, /* Max cond insns. */
-+ 8, /* Memset max inline. */
-+ 4, /* Issue rate. */
-+ ARM_PREFETCH_BENEFICIAL (0, -1, 64),
-+ tune_params::PREF_CONST_POOL_FALSE,
-+ tune_params::PREF_LDRD_TRUE,
-+ tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* Thumb. */
-+ tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* ARM. */
-+ tune_params::DISPARAGE_FLAGS_ALL,
-+ tune_params::PREF_NEON_64_FALSE,
-+ tune_params::PREF_NEON_STRINGOPS_TRUE,
-+ FUSE_OPS (tune_params::FUSE_MOVW_MOVT),
-+ tune_params::SCHED_AUTOPREF_FULL
-+};
-+
- /* Branches can be dual-issued on Cortex-A5, so conditional execution is
- less appealing. Set max_insns_skipped to a low value. */
-
- const struct tune_params arm_cortex_a5_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa5_extra_costs,
- NULL, /* Sched adj cost. */
- arm_cortex_a5_branch_cost,
-@@ -2083,7 +2098,6 @@ const struct tune_params arm_cortex_a5_tune =
-
- const struct tune_params arm_cortex_a9_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa9_extra_costs,
- cortex_a9_sched_adjust_cost,
- arm_default_branch_cost,
-@@ -2106,7 +2120,6 @@ const struct tune_params arm_cortex_a9_tune =
-
- const struct tune_params arm_cortex_a12_tune =
- {
-- arm_9e_rtx_costs,
- &cortexa12_extra_costs,
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
-@@ -2127,6 +2140,28 @@ const struct tune_params arm_cortex_a12_tune =
- tune_params::SCHED_AUTOPREF_OFF
- };
-
-+const struct tune_params arm_cortex_a73_tune =
-+{
-+ &cortexa57_extra_costs,
-+ NULL, /* Sched adj cost. */
-+ arm_default_branch_cost,
-+ &arm_default_vec_cost, /* Vectorizer costs. */
-+ 1, /* Constant limit. */
-+ 2, /* Max cond insns. */
-+ 8, /* Memset max inline. */
-+ 2, /* Issue rate. */
-+ ARM_PREFETCH_NOT_BENEFICIAL,
-+ tune_params::PREF_CONST_POOL_FALSE,
-+ tune_params::PREF_LDRD_TRUE,
-+ tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* Thumb. */
-+ tune_params::LOG_OP_NON_SHORT_CIRCUIT_TRUE, /* ARM. */
-+ tune_params::DISPARAGE_FLAGS_ALL,
-+ tune_params::PREF_NEON_64_FALSE,
-+ tune_params::PREF_NEON_STRINGOPS_TRUE,
-+ FUSE_OPS (tune_params::FUSE_AES_AESMC | tune_params::FUSE_MOVW_MOVT),
-+ tune_params::SCHED_AUTOPREF_FULL
-+};
-+
- /* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
- cycle to execute each. An LDR from the constant pool also takes two cycles
- to execute, but mildly increases pipelining opportunity (consecutive
-@@ -2136,7 +2171,6 @@ const struct tune_params arm_cortex_a12_tune =
-
- const struct tune_params arm_v7m_tune =
- {
-- arm_9e_rtx_costs,
- &v7m_extra_costs,
- NULL, /* Sched adj cost. */
- arm_cortex_m_branch_cost,
-@@ -2161,7 +2195,6 @@ const struct tune_params arm_v7m_tune =
-
- const struct tune_params arm_cortex_m7_tune =
- {
-- arm_9e_rtx_costs,
- &v7m_extra_costs,
- NULL, /* Sched adj cost. */
- arm_cortex_m7_branch_cost,
-@@ -2183,11 +2216,11 @@ const struct tune_params arm_cortex_m7_tune =
- };
-
- /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
-- arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus. */
-+ arm_v6t2_tune. It is used for cortex-m0, cortex-m1, cortex-m0plus and
-+ cortex-m23. */
- const struct tune_params arm_v6m_tune =
- {
-- arm_9e_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- NULL, /* Sched adj cost. */
- arm_default_branch_cost,
- &arm_default_vec_cost, /* Vectorizer costs. */
-@@ -2209,8 +2242,7 @@ const struct tune_params arm_v6m_tune =
-
- const struct tune_params arm_fa726te_tune =
- {
-- arm_9e_rtx_costs,
-- NULL, /* Insn extra costs. */
-+ &generic_extra_costs, /* Insn extra costs. */
- fa726te_sched_adjust_cost,
- arm_default_branch_cost,
- &arm_default_vec_cost,
-@@ -2264,16 +2296,18 @@ static const struct processors *arm_selected_arch;
- static const struct processors *arm_selected_cpu;
- static const struct processors *arm_selected_tune;
-
--/* The name of the preprocessor macro to define for this architecture. */
-+/* The name of the preprocessor macro to define for this architecture. PROFILE
-+ is replaced by the architecture name (eg. 8A) in arm_option_override () and
-+ is thus chosen to be big enough to hold the longest architecture name. */
-
--char arm_arch_name[] = "__ARM_ARCH_0UNK__";
-+char arm_arch_name[] = "__ARM_ARCH_PROFILE__";
-
- /* Available values for -mfpu=. */
-
- const struct arm_fpu_desc all_fpus[] =
- {
--#define ARM_FPU(NAME, MODEL, REV, VFP_REGS, FEATURES) \
-- { NAME, MODEL, REV, VFP_REGS, FEATURES },
-+#define ARM_FPU(NAME, REV, VFP_REGS, FEATURES) \
-+ { NAME, REV, VFP_REGS, FEATURES },
- #include "arm-fpus.def"
- #undef ARM_FPU
- };
-@@ -2752,8 +2786,8 @@ arm_option_check_internal (struct gcc_options *opts)
- const struct arm_fpu_desc *fpu_desc = &all_fpus[opts->x_arm_fpu_index];
-
- /* iWMMXt and NEON are incompatible. */
-- if (TARGET_IWMMXT && TARGET_VFP
-- && ARM_FPU_FSET_HAS (fpu_desc->features, FPU_FL_NEON))
-+ if (TARGET_IWMMXT
-+ && ARM_FPU_FSET_HAS (fpu_desc->features, FPU_FL_NEON))
- error ("iWMMXt and NEON are incompatible");
-
- /* Make sure that the processor choice does not conflict with any of the
-@@ -2907,7 +2941,8 @@ arm_option_override_internal (struct gcc_options *opts,
- if (! opts_set->x_arm_restrict_it)
- opts->x_arm_restrict_it = arm_arch8;
-
-- if (!TARGET_THUMB2_P (opts->x_target_flags))
-+ /* ARM execution state and M profile don't have [restrict] IT. */
-+ if (!TARGET_THUMB2_P (opts->x_target_flags) || !arm_arch_notm)
- opts->x_arm_restrict_it = 0;
-
- /* Enable -munaligned-access by default for
-@@ -2918,7 +2953,8 @@ arm_option_override_internal (struct gcc_options *opts,
-
- Disable -munaligned-access by default for
- - all pre-ARMv6 architecture-based processors
-- - ARMv6-M architecture-based processors. */
-+ - ARMv6-M architecture-based processors
-+ - ARMv8-M Baseline processors. */
-
- if (! opts_set->x_unaligned_access)
- {
-@@ -3152,9 +3188,6 @@ arm_option_override (void)
- if (TARGET_APCS_REENT)
- warning (0, "APCS reentrant code not supported. Ignored");
-
-- if (TARGET_APCS_FLOAT)
-- warning (0, "passing floating point arguments in fp regs not yet supported");
--
- /* Initialize boolean versions of the flags, for use in the arm.md file. */
- arm_arch3m = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH3M);
- arm_arch4 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH4);
-@@ -3170,6 +3203,8 @@ arm_option_override (void)
- arm_arch7em = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH7EM);
- arm_arch8 = ARM_FSET_HAS_CPU1 (insn_flags, FL_ARCH8);
- arm_arch8_1 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_1);
-+ arm_arch8_2 = ARM_FSET_HAS_CPU2 (insn_flags, FL2_ARCH8_2);
-+ arm_arch_thumb1 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB);
- arm_arch_thumb2 = ARM_FSET_HAS_CPU1 (insn_flags, FL_THUMB2);
- arm_arch_xscale = ARM_FSET_HAS_CPU1 (insn_flags, FL_XSCALE);
-
-@@ -3184,7 +3219,15 @@ arm_option_override (void)
- arm_arch_no_volatile_ce = ARM_FSET_HAS_CPU1 (insn_flags, FL_NO_VOLATILE_CE);
- arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0;
- arm_arch_crc = ARM_FSET_HAS_CPU1 (insn_flags, FL_CRC32);
-+ arm_arch_cmse = ARM_FSET_HAS_CPU2 (insn_flags, FL2_CMSE);
- arm_m_profile_small_mul = ARM_FSET_HAS_CPU1 (insn_flags, FL_SMALLMUL);
-+ arm_fp16_inst = ARM_FSET_HAS_CPU2 (insn_flags, FL2_FP16INST);
-+ if (arm_fp16_inst)
-+ {
-+ if (arm_fp16_format == ARM_FP16_FORMAT_ALTERNATIVE)
-+ error ("selected fp16 options are incompatible.");
-+ arm_fp16_format = ARM_FP16_FORMAT_IEEE;
-+ }
-
- /* V5 code we generate is completely interworking capable, so we turn off
- TARGET_INTERWORK here to avoid many tests later on. */
-@@ -3222,10 +3265,8 @@ arm_option_override (void)
- /* If soft-float is specified then don't use FPU. */
- if (TARGET_SOFT_FLOAT)
- arm_fpu_attr = FPU_NONE;
-- else if (TARGET_VFP)
-- arm_fpu_attr = FPU_VFP;
- else
-- gcc_unreachable();
-+ arm_fpu_attr = FPU_VFP;
-
- if (TARGET_AAPCS_BASED)
- {
-@@ -3245,15 +3286,14 @@ arm_option_override (void)
- if (arm_abi == ARM_ABI_IWMMXT)
- arm_pcs_default = ARM_PCS_AAPCS_IWMMXT;
- else if (arm_float_abi == ARM_FLOAT_ABI_HARD
-- && TARGET_HARD_FLOAT
-- && TARGET_VFP)
-+ && TARGET_HARD_FLOAT)
- arm_pcs_default = ARM_PCS_AAPCS_VFP;
- else
- arm_pcs_default = ARM_PCS_AAPCS;
- }
- else
- {
-- if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP)
-+ if (arm_float_abi == ARM_FLOAT_ABI_HARD)
- sorry ("-mfloat-abi=hard and VFP");
-
- if (arm_abi == ARM_ABI_APCS)
-@@ -3298,6 +3338,20 @@ arm_option_override (void)
- }
- }
-
-+ if (TARGET_VXWORKS_RTP)
-+ {
-+ if (!global_options_set.x_arm_pic_data_is_text_relative)
-+ arm_pic_data_is_text_relative = 0;
-+ }
-+ else if (flag_pic
-+ && !arm_pic_data_is_text_relative
-+ && !(global_options_set.x_target_flags & MASK_SINGLE_PIC_BASE))
-+ /* When text & data segments don't have a fixed displacement, the
-+ intended use is with a single, read only, pic base register.
-+ Unless the user explicitly requested not to do that, set
-+ it. */
-+ target_flags |= MASK_SINGLE_PIC_BASE;
-+
- /* If stack checking is disabled, we can use r10 as the PIC register,
- which keeps r9 available. The EABI specifies r9 as the PIC register. */
- if (flag_pic && TARGET_SINGLE_PIC_BASE)
-@@ -3329,10 +3383,6 @@ arm_option_override (void)
- arm_pic_register = pic_register;
- }
-
-- if (TARGET_VXWORKS_RTP
-- && !global_options_set.x_arm_pic_data_is_text_relative)
-- arm_pic_data_is_text_relative = 0;
--
- /* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */
- if (fix_cm3_ldrd == 2)
- {
-@@ -3436,6 +3486,9 @@ arm_option_override (void)
- if (target_slow_flash_data)
- arm_disable_literal_pool = true;
-
-+ if (use_cmse && !arm_arch_cmse)
-+ error ("target CPU does not support ARMv8-M Security Extensions");
-+
- /* Disable scheduling fusion by default if it's not armv7 processor
- or doesn't prefer ldrd/strd. */
- if (flag_schedule_fusion == 2
-@@ -3568,6 +3621,9 @@ arm_compute_func_type (void)
- else
- type |= arm_isr_value (TREE_VALUE (a));
-
-+ if (lookup_attribute ("cmse_nonsecure_entry", attr))
-+ type |= ARM_FT_CMSE_ENTRY;
-+
- return type;
- }
-
-@@ -3794,6 +3850,11 @@ use_return_insn (int iscond, rtx sibling)
- return 0;
- }
-
-+ /* ARMv8-M nonsecure entry function need to use bxns to return and thus need
-+ several instructions if anything needs to be popped. */
-+ if (saved_int_regs && IS_CMSE_ENTRY (func_type))
-+ return 0;
-+
- /* If there are saved registers but the LR isn't saved, then we need
- two instructions for the return. */
- if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM)))
-@@ -3801,7 +3862,7 @@ use_return_insn (int iscond, rtx sibling)
-
- /* Can't be done if any of the VFP regs are pushed,
- since this also requires an insn. */
-- if (TARGET_HARD_FLOAT && TARGET_VFP)
-+ if (TARGET_HARD_FLOAT)
- for (regno = FIRST_VFP_REGNUM; regno <= LAST_VFP_REGNUM; regno++)
- if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
- return 0;
-@@ -3899,7 +3960,7 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
- {
- case SET:
- /* See if we can use movw. */
-- if (arm_arch_thumb2 && (i & 0xffff0000) == 0)
-+ if (TARGET_HAVE_MOVT && (i & 0xffff0000) == 0)
- return 1;
- else
- /* Otherwise, try mvn. */
-@@ -4118,7 +4179,7 @@ optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
- yield a shorter sequence, we may as well use zero. */
- insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start);
- if (best_start != 0
-- && ((((unsigned HOST_WIDE_INT) 1) << best_start) < val))
-+ && ((HOST_WIDE_INT_1U << best_start) < val))
- {
- insns2 = optimal_immediate_sequence_1 (code, val, &tmp_sequence, 0);
- if (insns2 <= insns1)
-@@ -4949,7 +5010,7 @@ arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
- if (mode == VOIDmode)
- mode = GET_MODE (*op1);
-
-- maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1;
-+ maxval = (HOST_WIDE_INT_1U << (GET_MODE_BITSIZE (mode) - 1)) - 1;
-
- /* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode
- we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either
-@@ -5255,7 +5316,6 @@ arm_function_value_regno_p (const unsigned int regno)
- if (regno == ARG_REGISTER (1)
- || (TARGET_32BIT
- && TARGET_AAPCS_BASED
-- && TARGET_VFP
- && TARGET_HARD_FLOAT
- && regno == FIRST_VFP_REGNUM)
- || (TARGET_IWMMXT_ABI
-@@ -5274,7 +5334,7 @@ arm_apply_result_size (void)
-
- if (TARGET_32BIT)
- {
-- if (TARGET_HARD_FLOAT_ABI && TARGET_VFP)
-+ if (TARGET_HARD_FLOAT_ABI)
- size += 32;
- if (TARGET_IWMMXT_ABI)
- size += 8;
-@@ -5549,7 +5609,7 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
- {
- case REAL_TYPE:
- mode = TYPE_MODE (type);
-- if (mode != DFmode && mode != SFmode)
-+ if (mode != DFmode && mode != SFmode && mode != HFmode)
- return -1;
-
- if (*modep == VOIDmode)
-@@ -5722,7 +5782,7 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
- if (pcs_variant != ARM_PCS_AAPCS_LOCAL)
- return false;
-
-- return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT &&
-+ return (TARGET_32BIT && TARGET_HARD_FLOAT &&
- (TARGET_VFP_DOUBLE || !is_double));
- }
-
-@@ -5797,11 +5857,16 @@ aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, machine_mode mode,
- &pcum->aapcs_vfp_rcount);
- }
-
-+/* Implement the allocate field in aapcs_cp_arg_layout. See the comment there
-+ for the behaviour of this function. */
-+
- static bool
- aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
- const_tree type ATTRIBUTE_UNUSED)
- {
-- int shift = GET_MODE_SIZE (pcum->aapcs_vfp_rmode) / GET_MODE_SIZE (SFmode);
-+ int rmode_size
-+ = MAX (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), GET_MODE_SIZE (SFmode));
-+ int shift = rmode_size / GET_MODE_SIZE (SFmode);
- unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1;
- int regno;
-
-@@ -5850,6 +5915,9 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, machine_mode mode,
- return false;
- }
-
-+/* Implement the allocate_return_reg field in aapcs_cp_arg_layout. See the
-+ comment there for the behaviour of this function. */
-+
- static rtx
- aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
- machine_mode mode,
-@@ -5940,13 +6008,13 @@ static struct
- required for a return from FUNCTION_ARG. */
- bool (*allocate) (CUMULATIVE_ARGS *, machine_mode, const_tree);
-
-- /* Return true if a result of mode MODE (or type TYPE if MODE is
-- BLKmode) is can be returned in this co-processor's registers. */
-+ /* Return true if a result of mode MODE (or type TYPE if MODE is BLKmode) can
-+ be returned in this co-processor's registers. */
- bool (*is_return_candidate) (enum arm_pcs, machine_mode, const_tree);
-
-- /* Allocate and return an RTX element to hold the return type of a
-- call, this routine must not fail and will only be called if
-- is_return_candidate returned true with the same parameters. */
-+ /* Allocate and return an RTX element to hold the return type of a call. This
-+ routine must not fail and will only be called if is_return_candidate
-+ returned true with the same parameters. */
- rtx (*allocate_return_reg) (enum arm_pcs, machine_mode, const_tree);
-
- /* Finish processing this argument and prepare to start processing
-@@ -6561,6 +6629,185 @@ arm_handle_notshared_attribute (tree *node,
- }
- #endif
-
-+/* This function returns true if a function with declaration FNDECL and type
-+ FNTYPE uses the stack to pass arguments or return variables and false
-+ otherwise. This is used for functions with the attributes
-+ 'cmse_nonsecure_call' or 'cmse_nonsecure_entry' and this function will issue
-+ diagnostic messages if the stack is used. NAME is the name of the attribute
-+ used. */
-+
-+static bool
-+cmse_func_args_or_return_in_stack (tree fndecl, tree name, tree fntype)
-+{
-+ function_args_iterator args_iter;
-+ CUMULATIVE_ARGS args_so_far_v;
-+ cumulative_args_t args_so_far;
-+ bool first_param = true;
-+ tree arg_type, prev_arg_type = NULL_TREE, ret_type;
-+
-+ /* Error out if any argument is passed on the stack. */
-+ arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX, fndecl);
-+ args_so_far = pack_cumulative_args (&args_so_far_v);
-+ FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter)
-+ {
-+ rtx arg_rtx;
-+ machine_mode arg_mode = TYPE_MODE (arg_type);
-+
-+ prev_arg_type = arg_type;
-+ if (VOID_TYPE_P (arg_type))
-+ continue;
-+
-+ if (!first_param)
-+ arm_function_arg_advance (args_so_far, arg_mode, arg_type, true);
-+ arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type, true);
-+ if (!arg_rtx
-+ || arm_arg_partial_bytes (args_so_far, arg_mode, arg_type, true))
-+ {
-+ error ("%qE attribute not available to functions with arguments "
-+ "passed on the stack", name);
-+ return true;
-+ }
-+ first_param = false;
-+ }
-+
-+ /* Error out for variadic functions since we cannot control how many
-+ arguments will be passed and thus stack could be used. stdarg_p () is not
-+ used for the checking to avoid browsing arguments twice. */
-+ if (prev_arg_type != NULL_TREE && !VOID_TYPE_P (prev_arg_type))
-+ {
-+ error ("%qE attribute not available to functions with variable number "
-+ "of arguments", name);
-+ return true;
-+ }
-+
-+ /* Error out if return value is passed on the stack. */
-+ ret_type = TREE_TYPE (fntype);
-+ if (arm_return_in_memory (ret_type, fntype))
-+ {
-+ error ("%qE attribute not available to functions that return value on "
-+ "the stack", name);
-+ return true;
-+ }
-+ return false;
-+}
-+
-+/* Called upon detection of the use of the cmse_nonsecure_entry attribute, this
-+ function will check whether the attribute is allowed here and will add the
-+ attribute to the function declaration tree or otherwise issue a warning. */
-+
-+static tree
-+arm_handle_cmse_nonsecure_entry (tree *node, tree name,
-+ tree /* args */,
-+ int /* flags */,
-+ bool *no_add_attrs)
-+{
-+ tree fndecl;
-+
-+ if (!use_cmse)
-+ {
-+ *no_add_attrs = true;
-+ warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.",
-+ name);
-+ return NULL_TREE;
-+ }
-+
-+ /* Ignore attribute for function types. */
-+ if (TREE_CODE (*node) != FUNCTION_DECL)
-+ {
-+ warning (OPT_Wattributes, "%qE attribute only applies to functions",
-+ name);
-+ *no_add_attrs = true;
-+ return NULL_TREE;
-+ }
-+
-+ fndecl = *node;
-+
-+ /* Warn for static linkage functions. */
-+ if (!TREE_PUBLIC (fndecl))
-+ {
-+ warning (OPT_Wattributes, "%qE attribute has no effect on functions "
-+ "with static linkage", name);
-+ *no_add_attrs = true;
-+ return NULL_TREE;
-+ }
-+
-+ *no_add_attrs |= cmse_func_args_or_return_in_stack (fndecl, name,
-+ TREE_TYPE (fndecl));
-+ return NULL_TREE;
-+}
-+
-+
-+/* Called upon detection of the use of the cmse_nonsecure_call attribute, this
-+ function will check whether the attribute is allowed here and will add the
-+ attribute to the function type tree or otherwise issue a diagnostic. The
-+ reason we check this at declaration time is to only allow the use of the
-+ attribute with declarations of function pointers and not function
-+ declarations. This function checks NODE is of the expected type and issues
-+ diagnostics otherwise using NAME. If it is not of the expected type
-+ *NO_ADD_ATTRS will be set to true. */
-+
-+static tree
-+arm_handle_cmse_nonsecure_call (tree *node, tree name,
-+ tree /* args */,
-+ int /* flags */,
-+ bool *no_add_attrs)
-+{
-+ tree decl = NULL_TREE, fntype = NULL_TREE;
-+ tree type;
-+
-+ if (!use_cmse)
-+ {
-+ *no_add_attrs = true;
-+ warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.",
-+ name);
-+ return NULL_TREE;
-+ }
-+
-+ if (TREE_CODE (*node) == VAR_DECL || TREE_CODE (*node) == TYPE_DECL)
-+ {
-+ decl = *node;
-+ fntype = TREE_TYPE (decl);
-+ }
-+
-+ while (fntype != NULL_TREE && TREE_CODE (fntype) == POINTER_TYPE)
-+ fntype = TREE_TYPE (fntype);
-+
-+ if (!decl || TREE_CODE (fntype) != FUNCTION_TYPE)
-+ {
-+ warning (OPT_Wattributes, "%qE attribute only applies to base type of a "
-+ "function pointer", name);
-+ *no_add_attrs = true;
-+ return NULL_TREE;
-+ }
-+
-+ *no_add_attrs |= cmse_func_args_or_return_in_stack (NULL, name, fntype);
-+
-+ if (*no_add_attrs)
-+ return NULL_TREE;
-+
-+ /* Prevent trees being shared among function types with and without
-+ cmse_nonsecure_call attribute. */
-+ type = TREE_TYPE (decl);
-+
-+ type = build_distinct_type_copy (type);
-+ TREE_TYPE (decl) = type;
-+ fntype = type;
-+
-+ while (TREE_CODE (fntype) != FUNCTION_TYPE)
-+ {
-+ type = fntype;
-+ fntype = TREE_TYPE (fntype);
-+ fntype = build_distinct_type_copy (fntype);
-+ TREE_TYPE (type) = fntype;
-+ }
-+
-+ /* Construct a type attribute and add it to the function type. */
-+ tree attrs = tree_cons (get_identifier ("cmse_nonsecure_call"), NULL_TREE,
-+ TYPE_ATTRIBUTES (fntype));
-+ TYPE_ATTRIBUTES (fntype) = attrs;
-+ return NULL_TREE;
-+}
-+
- /* Return 0 if the attributes for two types are incompatible, 1 if they
- are compatible, and 2 if they are nearly compatible (which causes a
- warning to be generated). */
-@@ -6601,6 +6848,14 @@ arm_comp_type_attributes (const_tree type1, const_tree type2)
- if (l1 != l2)
- return 0;
-
-+ l1 = lookup_attribute ("cmse_nonsecure_call",
-+ TYPE_ATTRIBUTES (type1)) != NULL;
-+ l2 = lookup_attribute ("cmse_nonsecure_call",
-+ TYPE_ATTRIBUTES (type2)) != NULL;
-+
-+ if (l1 != l2)
-+ return 0;
-+
- return 1;
- }
-
-@@ -6711,7 +6966,7 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
- may be used both as target of the call and base register for restoring
- the VFP registers */
- if (TARGET_APCS_FRAME && TARGET_ARM
-- && TARGET_HARD_FLOAT && TARGET_VFP
-+ && TARGET_HARD_FLOAT
- && decl && arm_is_long_call_p (decl))
- return false;
-
-@@ -6727,6 +6982,20 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
- if (IS_INTERRUPT (func_type))
- return false;
-
-+ /* ARMv8-M non-secure entry functions need to return with bxns which is only
-+ generated for entry functions themselves. */
-+ if (IS_CMSE_ENTRY (arm_current_func_type ()))
-+ return false;
-+
-+ /* We do not allow ARMv8-M non-secure calls to be turned into sibling calls,
-+ this would complicate matters for later code generation. */
-+ if (TREE_CODE (exp) == CALL_EXPR)
-+ {
-+ tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
-+ if (lookup_attribute ("cmse_nonsecure_call", TYPE_ATTRIBUTES (fntype)))
-+ return false;
-+ }
-+
- if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
- {
- /* Check that the return value locations are the same. For
-@@ -7187,8 +7456,7 @@ arm_legitimate_address_outer_p (machine_mode mode, rtx x, RTX_CODE outer,
- return 1;
-
- use_ldrd = (TARGET_LDRD
-- && (mode == DImode
-- || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP))));
-+ && (mode == DImode || mode == DFmode));
-
- if (code == POST_INC || code == PRE_DEC
- || ((code == PRE_INC || code == POST_DEC)
-@@ -7273,8 +7541,7 @@ thumb2_legitimate_address_p (machine_mode mode, rtx x, int strict_p)
- return 1;
-
- use_ldrd = (TARGET_LDRD
-- && (mode == DImode
-- || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP))));
-+ && (mode == DImode || mode == DFmode));
-
- if (code == POST_INC || code == PRE_DEC
- || ((code == PRE_INC || code == POST_DEC)
-@@ -7367,7 +7634,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer,
-
- /* Standard coprocessor addressing modes. */
- if (TARGET_HARD_FLOAT
-- && TARGET_VFP
- && (mode == SFmode || mode == DFmode))
- return (code == CONST_INT && INTVAL (index) < 1024
- && INTVAL (index) > -1024
-@@ -7487,7 +7753,6 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p)
- /* ??? Combine arm and thumb2 coprocessor addressing modes. */
- /* Standard coprocessor addressing modes. */
- if (TARGET_HARD_FLOAT
-- && TARGET_VFP
- && (mode == SFmode || mode == DFmode))
- return (code == CONST_INT && INTVAL (index) < 1024
- /* Thumb-2 allows only > -256 index range for it's core register
-@@ -8033,8 +8298,7 @@ arm_legitimize_address (rtx x, rtx orig_x, machine_mode mode)
-
- /* VFP addressing modes actually allow greater offsets, but for
- now we just stick with the lowest common denominator. */
-- if (mode == DImode
-- || ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode))
-+ if (mode == DImode || mode == DFmode)
- {
- low_n = n & 0x0f;
- n &= ~0x0f;
-@@ -8226,6 +8490,12 @@ arm_legitimate_constant_p_1 (machine_mode, rtx x)
- static bool
- thumb_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
- {
-+ /* Splitters for TARGET_USE_MOVT call arm_emit_movpair which creates high
-+ RTX. These RTX must therefore be allowed for Thumb-1 so that when run
-+ for ARMv8-M Baseline or later the result is valid. */
-+ if (TARGET_HAVE_MOVT && GET_CODE (x) == HIGH)
-+ x = XEXP (x, 0);
-+
- return (CONST_INT_P (x)
- || CONST_DOUBLE_P (x)
- || CONSTANT_ADDRESS_P (x)
-@@ -8312,7 +8582,9 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
- case CONST_INT:
- if (outer == SET)
- {
-- if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
-+ if (UINTVAL (x) < 256
-+ /* 16-bit constant. */
-+ || (TARGET_HAVE_MOVT && !(INTVAL (x) & 0xffff0000)))
- return 0;
- if (thumb_shiftable_const (INTVAL (x)))
- return COSTS_N_INSNS (2);
-@@ -8329,8 +8601,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
- int i;
- /* This duplicates the tests in the andsi3 expander. */
- for (i = 9; i <= 31; i++)
-- if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
-- || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
-+ if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (x)
-+ || (HOST_WIDE_INT_1 << i) - 1 == ~INTVAL (x))
- return COSTS_N_INSNS (2);
- }
- else if (outer == ASHIFT || outer == ASHIFTRT
-@@ -8393,1006 +8665,162 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
- }
- }
-
--static inline bool
--arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
-+/* Estimates the size cost of thumb1 instructions.
-+ For now most of the code is copied from thumb1_rtx_costs. We need more
-+ fine grain tuning when we have more related test cases. */
-+static inline int
-+thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
- {
- machine_mode mode = GET_MODE (x);
-- enum rtx_code subcode;
-- rtx operand;
-- enum rtx_code code = GET_CODE (x);
-- *total = 0;
-+ int words, cost;
-
- switch (code)
- {
-- case MEM:
-- /* Memory costs quite a lot for the first word, but subsequent words
-- load at the equivalent of a single insn each. */
-- *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
-- return true;
-+ case ASHIFT:
-+ case ASHIFTRT:
-+ case LSHIFTRT:
-+ case ROTATERT:
-+ return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2);
-
-- case DIV:
-- case MOD:
-- case UDIV:
-- case UMOD:
-- if (TARGET_HARD_FLOAT && mode == SFmode)
-- *total = COSTS_N_INSNS (2);
-- else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE)
-- *total = COSTS_N_INSNS (4);
-- else
-- *total = COSTS_N_INSNS (20);
-- return false;
-+ case PLUS:
-+ case MINUS:
-+ /* Thumb-1 needs two instructions to fulfill shiftadd/shiftsub0/shiftsub1
-+ defined by RTL expansion, especially for the expansion of
-+ multiplication. */
-+ if ((GET_CODE (XEXP (x, 0)) == MULT
-+ && power_of_two_operand (XEXP (XEXP (x,0),1), SImode))
-+ || (GET_CODE (XEXP (x, 1)) == MULT
-+ && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)))
-+ return COSTS_N_INSNS (2);
-+ /* On purpose fall through for normal RTX. */
-+ case COMPARE:
-+ case NEG:
-+ case NOT:
-+ return COSTS_N_INSNS (1);
-
-- case ROTATE:
-- if (REG_P (XEXP (x, 1)))
-- *total = COSTS_N_INSNS (1); /* Need to subtract from 32 */
-- else if (!CONST_INT_P (XEXP (x, 1)))
-- *total = rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-+ case MULT:
-+ if (CONST_INT_P (XEXP (x, 1)))
-+ {
-+ /* Thumb1 mul instruction can't operate on const. We must Load it
-+ into a register first. */
-+ int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
-+ /* For the targets which have a very small and high-latency multiply
-+ unit, we prefer to synthesize the mult with up to 5 instructions,
-+ giving a good balance between size and performance. */
-+ if (arm_arch6m && arm_m_profile_small_mul)
-+ return COSTS_N_INSNS (5);
-+ else
-+ return COSTS_N_INSNS (1) + const_size;
-+ }
-+ return COSTS_N_INSNS (1);
-
-- /* Fall through */
-- case ROTATERT:
-- if (mode != SImode)
-- {
-- *total += COSTS_N_INSNS (4);
-- return true;
-- }
-+ case SET:
-+ /* A SET doesn't have a mode, so let's look at the SET_DEST to get
-+ the mode. */
-+ words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
-+ cost = COSTS_N_INSNS (words);
-+ if (satisfies_constraint_J (SET_SRC (x))
-+ || satisfies_constraint_K (SET_SRC (x))
-+ /* Too big an immediate for a 2-byte mov, using MOVT. */
-+ || (CONST_INT_P (SET_SRC (x))
-+ && UINTVAL (SET_SRC (x)) >= 256
-+ && TARGET_HAVE_MOVT
-+ && satisfies_constraint_j (SET_SRC (x)))
-+ /* thumb1_movdi_insn. */
-+ || ((words > 1) && MEM_P (SET_SRC (x))))
-+ cost += COSTS_N_INSNS (1);
-+ return cost;
-
-- /* Fall through */
-- case ASHIFT: case LSHIFTRT: case ASHIFTRT:
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- if (mode == DImode)
-- {
-- *total += COSTS_N_INSNS (3);
-- return true;
-- }
-+ case CONST_INT:
-+ if (outer == SET)
-+ {
-+ if (UINTVAL (x) < 256)
-+ return COSTS_N_INSNS (1);
-+ /* movw is 4byte long. */
-+ if (TARGET_HAVE_MOVT && !(INTVAL (x) & 0xffff0000))
-+ return COSTS_N_INSNS (2);
-+ /* See split "TARGET_THUMB1 && satisfies_constraint_J". */
-+ if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
-+ return COSTS_N_INSNS (2);
-+ /* See split "TARGET_THUMB1 && satisfies_constraint_K". */
-+ if (thumb_shiftable_const (INTVAL (x)))
-+ return COSTS_N_INSNS (2);
-+ return COSTS_N_INSNS (3);
-+ }
-+ else if ((outer == PLUS || outer == COMPARE)
-+ && INTVAL (x) < 256 && INTVAL (x) > -256)
-+ return 0;
-+ else if ((outer == IOR || outer == XOR || outer == AND)
-+ && INTVAL (x) < 256 && INTVAL (x) >= -256)
-+ return COSTS_N_INSNS (1);
-+ else if (outer == AND)
-+ {
-+ int i;
-+ /* This duplicates the tests in the andsi3 expander. */
-+ for (i = 9; i <= 31; i++)
-+ if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (x)
-+ || (HOST_WIDE_INT_1 << i) - 1 == ~INTVAL (x))
-+ return COSTS_N_INSNS (2);
-+ }
-+ else if (outer == ASHIFT || outer == ASHIFTRT
-+ || outer == LSHIFTRT)
-+ return 0;
-+ return COSTS_N_INSNS (2);
-
-- *total += COSTS_N_INSNS (1);
-- /* Increase the cost of complex shifts because they aren't any faster,
-- and reduce dual issue opportunities. */
-- if (arm_tune_cortex_a9
-- && outer != SET && !CONST_INT_P (XEXP (x, 1)))
-- ++*total;
-+ case CONST:
-+ case CONST_DOUBLE:
-+ case LABEL_REF:
-+ case SYMBOL_REF:
-+ return COSTS_N_INSNS (3);
-
-- return true;
-+ case UDIV:
-+ case UMOD:
-+ case DIV:
-+ case MOD:
-+ return 100;
-
-- case MINUS:
-- if (mode == DImode)
-- {
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- if (CONST_INT_P (XEXP (x, 0))
-- && const_ok_for_arm (INTVAL (XEXP (x, 0))))
-- {
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- return true;
-- }
--
-- if (CONST_INT_P (XEXP (x, 1))
-- && const_ok_for_arm (INTVAL (XEXP (x, 1))))
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
-- }
--
-- return false;
-- }
--
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
-- {
-- if (TARGET_HARD_FLOAT
-- && (mode == SFmode
-- || (mode == DFmode && !TARGET_VFP_SINGLE)))
-- {
-- *total = COSTS_N_INSNS (1);
-- if (CONST_DOUBLE_P (XEXP (x, 0))
-- && arm_const_double_rtx (XEXP (x, 0)))
-- {
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- return true;
-- }
--
-- if (CONST_DOUBLE_P (XEXP (x, 1))
-- && arm_const_double_rtx (XEXP (x, 1)))
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
-- }
--
-- return false;
-- }
-- *total = COSTS_N_INSNS (20);
-- return false;
-- }
--
-- *total = COSTS_N_INSNS (1);
-- if (CONST_INT_P (XEXP (x, 0))
-- && const_ok_for_arm (INTVAL (XEXP (x, 0))))
-- {
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- return true;
-- }
--
-- subcode = GET_CODE (XEXP (x, 1));
-- if (subcode == ASHIFT || subcode == ASHIFTRT
-- || subcode == LSHIFTRT
-- || subcode == ROTATE || subcode == ROTATERT)
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- *total += rtx_cost (XEXP (XEXP (x, 1), 0), mode, subcode, 0, speed);
-- return true;
-- }
--
-- /* A shift as a part of RSB costs no more than RSB itself. */
-- if (GET_CODE (XEXP (x, 0)) == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
-- {
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, code, 0, speed);
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- return true;
-- }
--
-- if (subcode == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- *total += rtx_cost (XEXP (XEXP (x, 1), 0), mode, subcode, 0, speed);
-- return true;
-- }
--
-- if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE
-- || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE)
-- {
-- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, speed);
-- if (REG_P (XEXP (XEXP (x, 1), 0))
-- && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM)
-- *total += COSTS_N_INSNS (1);
--
-- return true;
-- }
--
-- /* Fall through */
--
-- case PLUS:
-- if (code == PLUS && arm_arch6 && mode == SImode
-- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
-- {
-- *total = COSTS_N_INSNS (1);
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), VOIDmode,
-- GET_CODE (XEXP (x, 0)), 0, speed);
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- return true;
-- }
--
-- /* MLA: All arguments must be registers. We filter out
-- multiplication by a power of two, so that we fall down into
-- the code below. */
-- if (GET_CODE (XEXP (x, 0)) == MULT
-- && !power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
-- {
-- /* The cost comes from the cost of the multiply. */
-- return false;
-- }
--
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
-- {
-- if (TARGET_HARD_FLOAT
-- && (mode == SFmode
-- || (mode == DFmode && !TARGET_VFP_SINGLE)))
-- {
-- *total = COSTS_N_INSNS (1);
-- if (CONST_DOUBLE_P (XEXP (x, 1))
-- && arm_const_double_rtx (XEXP (x, 1)))
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
-- }
--
-- return false;
-- }
--
-- *total = COSTS_N_INSNS (20);
-- return false;
-- }
--
-- if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE
-- || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE)
-- {
-- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), mode, code,
-- 1, speed);
-- if (REG_P (XEXP (XEXP (x, 0), 0))
-- && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM)
-- *total += COSTS_N_INSNS (1);
-- return true;
-- }
--
-- /* Fall through */
--
-- case AND: case XOR: case IOR:
--
-- /* Normally the frame registers will be spilt into reg+const during
-- reload, so it is a bad idea to combine them with other instructions,
-- since then they might not be moved outside of loops. As a compromise
-- we allow integration with ops that have a constant as their second
-- operand. */
-- if (REG_OR_SUBREG_REG (XEXP (x, 0))
-- && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
-- && !CONST_INT_P (XEXP (x, 1)))
-- *total = COSTS_N_INSNS (1);
--
-- if (mode == DImode)
-- {
-- *total += COSTS_N_INSNS (2);
-- if (CONST_INT_P (XEXP (x, 1))
-- && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
-- }
--
-- return false;
-- }
--
-- *total += COSTS_N_INSNS (1);
-- if (CONST_INT_P (XEXP (x, 1))
-- && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
-- {
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
-- }
-- subcode = GET_CODE (XEXP (x, 0));
-- if (subcode == ASHIFT || subcode == ASHIFTRT
-- || subcode == LSHIFTRT
-- || subcode == ROTATE || subcode == ROTATERT)
-- {
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
-- return true;
-- }
--
-- if (subcode == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
-- {
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
-- return true;
-- }
--
-- if (subcode == UMIN || subcode == UMAX
-- || subcode == SMIN || subcode == SMAX)
-- {
-- *total = COSTS_N_INSNS (3);
-- return true;
-- }
--
-- return false;
--
-- case MULT:
-- /* This should have been handled by the CPU specific routines. */
-- gcc_unreachable ();
--
-- case TRUNCATE:
-- if (arm_arch3m && mode == SImode
-- && GET_CODE (XEXP (x, 0)) == LSHIFTRT
-- && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
-- && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0))
-- == GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)))
-- && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
-- || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND))
-- {
-- *total = rtx_cost (XEXP (XEXP (x, 0), 0), VOIDmode, LSHIFTRT,
-- 0, speed);
-- return true;
-- }
-- *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */
-- return false;
--
-- case NEG:
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
-- {
-- if (TARGET_HARD_FLOAT
-- && (mode == SFmode
-- || (mode == DFmode && !TARGET_VFP_SINGLE)))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
-- *total = COSTS_N_INSNS (2);
-- return false;
-- }
--
-- /* Fall through */
-- case NOT:
-- *total = COSTS_N_INSNS (ARM_NUM_REGS(mode));
-- if (mode == SImode && code == NOT)
-- {
-- subcode = GET_CODE (XEXP (x, 0));
-- if (subcode == ASHIFT || subcode == ASHIFTRT
-- || subcode == LSHIFTRT
-- || subcode == ROTATE || subcode == ROTATERT
-- || (subcode == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)))
-- {
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode,
-- 0, speed);
-- /* Register shifts cost an extra cycle. */
-- if (!CONST_INT_P (XEXP (XEXP (x, 0), 1)))
-- *total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1),
-- mode, subcode,
-- 1, speed);
-- return true;
-- }
-- }
--
-- return false;
--
-- case IF_THEN_ELSE:
-- if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
-- {
-- *total = COSTS_N_INSNS (4);
-- return true;
-- }
--
-- operand = XEXP (x, 0);
--
-- if (!((GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMPARE
-- || GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMM_COMPARE)
-- && REG_P (XEXP (operand, 0))
-- && REGNO (XEXP (operand, 0)) == CC_REGNUM))
-- *total += COSTS_N_INSNS (1);
-- *total += rtx_cost (XEXP (x, 1), VOIDmode, code, 1, speed);
-- *total += rtx_cost (XEXP (x, 2), VOIDmode, code, 2, speed);
-- return true;
--
-- case NE:
-- if (mode == SImode && XEXP (x, 1) == const0_rtx)
-- {
-- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, speed);
-- return true;
-- }
-- goto scc_insn;
--
-- case GE:
-- if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM)
-- && mode == SImode && XEXP (x, 1) == const0_rtx)
-- {
-- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, speed);
-- return true;
-- }
-- goto scc_insn;
--
-- case LT:
-- if ((!REG_P (XEXP (x, 0)) || REGNO (XEXP (x, 0)) != CC_REGNUM)
-- && mode == SImode && XEXP (x, 1) == const0_rtx)
-- {
-- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, speed);
-- return true;
-- }
-- goto scc_insn;
--
-- case EQ:
-- case GT:
-- case LE:
-- case GEU:
-- case LTU:
-- case GTU:
-- case LEU:
-- case UNORDERED:
-- case ORDERED:
-- case UNEQ:
-- case UNGE:
-- case UNLT:
-- case UNGT:
-- case UNLE:
-- scc_insn:
-- /* SCC insns. In the case where the comparison has already been
-- performed, then they cost 2 instructions. Otherwise they need
-- an additional comparison before them. */
-- *total = COSTS_N_INSNS (2);
-- if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM)
-- {
-- return true;
-- }
--
-- /* Fall through */
-- case COMPARE:
-- if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == CC_REGNUM)
-- {
-- *total = 0;
-- return true;
-- }
--
-- *total += COSTS_N_INSNS (1);
-- if (CONST_INT_P (XEXP (x, 1))
-- && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
-- {
-- *total += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed);
-- return true;
-- }
--
-- subcode = GET_CODE (XEXP (x, 0));
-- if (subcode == ASHIFT || subcode == ASHIFTRT
-- || subcode == LSHIFTRT
-- || subcode == ROTATE || subcode == ROTATERT)
-- {
-- mode = GET_MODE (XEXP (x, 0));
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
-- return true;
-- }
--
-- if (subcode == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
-- {
-- mode = GET_MODE (XEXP (x, 0));
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, subcode, 0, speed);
-- return true;
-- }
--
-- return false;
--
-- case UMIN:
-- case UMAX:
-- case SMIN:
-- case SMAX:
-- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- if (!CONST_INT_P (XEXP (x, 1))
-- || !const_ok_for_arm (INTVAL (XEXP (x, 1))))
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed);
-- return true;
--
-- case ABS:
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
-- {
-- if (TARGET_HARD_FLOAT
-- && (mode == SFmode
-- || (mode == DFmode && !TARGET_VFP_SINGLE)))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
-- *total = COSTS_N_INSNS (20);
-- return false;
-- }
-- *total = COSTS_N_INSNS (1);
-- if (mode == DImode)
-- *total += COSTS_N_INSNS (3);
-- return false;
--
-- case SIGN_EXTEND:
-- case ZERO_EXTEND:
-- *total = 0;
-- if (GET_MODE_CLASS (mode) == MODE_INT)
-- {
-- rtx op = XEXP (x, 0);
-- machine_mode opmode = GET_MODE (op);
--
-- if (mode == DImode)
-- *total += COSTS_N_INSNS (1);
--
-- if (opmode != SImode)
-- {
-- if (MEM_P (op))
-- {
-- /* If !arm_arch4, we use one of the extendhisi2_mem
-- or movhi_bytes patterns for HImode. For a QImode
-- sign extension, we first zero-extend from memory
-- and then perform a shift sequence. */
-- if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND))
-- *total += COSTS_N_INSNS (2);
-- }
-- else if (arm_arch6)
-- *total += COSTS_N_INSNS (1);
--
-- /* We don't have the necessary insn, so we need to perform some
-- other operation. */
-- else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode)
-- /* An and with constant 255. */
-- *total += COSTS_N_INSNS (1);
-- else
-- /* A shift sequence. Increase costs slightly to avoid
-- combining two shifts into an extend operation. */
-- *total += COSTS_N_INSNS (2) + 1;
-- }
--
-- return false;
-- }
--
-- switch (GET_MODE (XEXP (x, 0)))
-- {
-- case V8QImode:
-- case V4HImode:
-- case V2SImode:
-- case V4QImode:
-- case V2HImode:
-- *total = COSTS_N_INSNS (1);
-- return false;
--
-- default:
-- gcc_unreachable ();
-- }
-- gcc_unreachable ();
--
-- case ZERO_EXTRACT:
-- case SIGN_EXTRACT:
-- mode = GET_MODE (XEXP (x, 0));
-- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
--
-- case CONST_INT:
-- if (const_ok_for_arm (INTVAL (x))
-- || const_ok_for_arm (~INTVAL (x)))
-- *total = COSTS_N_INSNS (1);
-- else
-- *total = COSTS_N_INSNS (arm_gen_constant (SET, mode, NULL_RTX,
-- INTVAL (x), NULL_RTX,
-- NULL_RTX, 0, 0));
-- return true;
--
-- case CONST:
-- case LABEL_REF:
-- case SYMBOL_REF:
-- *total = COSTS_N_INSNS (3);
-- return true;
--
-- case HIGH:
-- *total = COSTS_N_INSNS (1);
-- return true;
--
-- case LO_SUM:
-- *total = COSTS_N_INSNS (1);
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
--
-- case CONST_DOUBLE:
-- if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x)
-- && (mode == SFmode || !TARGET_VFP_SINGLE))
-- *total = COSTS_N_INSNS (1);
-- else
-- *total = COSTS_N_INSNS (4);
-- return true;
--
-- case SET:
-- /* The vec_extract patterns accept memory operands that require an
-- address reload. Account for the cost of that reload to give the
-- auto-inc-dec pass an incentive to try to replace them. */
-- if (TARGET_NEON && MEM_P (SET_DEST (x))
-- && GET_CODE (SET_SRC (x)) == VEC_SELECT)
-- {
-- mode = GET_MODE (SET_DEST (x));
-- *total = rtx_cost (SET_DEST (x), mode, code, 0, speed);
-- if (!neon_vector_mem_operand (SET_DEST (x), 2, true))
-- *total += COSTS_N_INSNS (1);
-- return true;
-- }
-- /* Likewise for the vec_set patterns. */
-- if (TARGET_NEON && GET_CODE (SET_SRC (x)) == VEC_MERGE
-- && GET_CODE (XEXP (SET_SRC (x), 0)) == VEC_DUPLICATE
-- && MEM_P (XEXP (XEXP (SET_SRC (x), 0), 0)))
-- {
-- rtx mem = XEXP (XEXP (SET_SRC (x), 0), 0);
-- mode = GET_MODE (SET_DEST (x));
-- *total = rtx_cost (mem, mode, code, 0, speed);
-- if (!neon_vector_mem_operand (mem, 2, true))
-- *total += COSTS_N_INSNS (1);
-- return true;
-- }
-- return false;
--
-- case UNSPEC:
-- /* We cost this as high as our memory costs to allow this to
-- be hoisted from loops. */
-- if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
-- {
-- *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
-- }
-- return true;
--
-- case CONST_VECTOR:
-- if (TARGET_NEON
-- && TARGET_HARD_FLOAT
-- && outer == SET
-- && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
-- && neon_immediate_valid_for_move (x, mode, NULL, NULL))
-- *total = COSTS_N_INSNS (1);
-- else
-- *total = COSTS_N_INSNS (4);
-- return true;
--
-- default:
-- *total = COSTS_N_INSNS (4);
-- return false;
-- }
--}
--
--/* Estimates the size cost of thumb1 instructions.
--   For now most of the code is copied from thumb1_rtx_costs.  We need
--   finer-grained tuning when we have more related test cases. */
--static inline int
--thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
--{
-- machine_mode mode = GET_MODE (x);
-- int words;
--
-- switch (code)
-- {
-- case ASHIFT:
-- case ASHIFTRT:
-- case LSHIFTRT:
-- case ROTATERT:
-- return (mode == SImode) ? COSTS_N_INSNS (1) : COSTS_N_INSNS (2);
--
-- case PLUS:
-- case MINUS:
-- /* Thumb-1 needs two instructions to fulfill shiftadd/shiftsub0/shiftsub1
-- defined by RTL expansion, especially for the expansion of
-- multiplication. */
-- if ((GET_CODE (XEXP (x, 0)) == MULT
-- && power_of_two_operand (XEXP (XEXP (x,0),1), SImode))
-- || (GET_CODE (XEXP (x, 1)) == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)))
-- return COSTS_N_INSNS (2);
-- /* On purpose fall through for normal RTX. */
-- case COMPARE:
-- case NEG:
-- case NOT:
-- return COSTS_N_INSNS (1);
--
-- case MULT:
-- if (CONST_INT_P (XEXP (x, 1)))
-- {
--	  /* The Thumb1 mul instruction can't operate on a constant; we must
--	     load it into a register first. */
-- int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
-- /* For the targets which have a very small and high-latency multiply
-- unit, we prefer to synthesize the mult with up to 5 instructions,
-- giving a good balance between size and performance. */
-- if (arm_arch6m && arm_m_profile_small_mul)
-- return COSTS_N_INSNS (5);
-- else
-- return COSTS_N_INSNS (1) + const_size;
-- }
-- return COSTS_N_INSNS (1);
--
-- case SET:
-- /* A SET doesn't have a mode, so let's look at the SET_DEST to get
-- the mode. */
-- words = ARM_NUM_INTS (GET_MODE_SIZE (GET_MODE (SET_DEST (x))));
-- return COSTS_N_INSNS (words)
-- + COSTS_N_INSNS (1) * (satisfies_constraint_J (SET_SRC (x))
-- || satisfies_constraint_K (SET_SRC (x))
-- /* thumb1_movdi_insn. */
-- || ((words > 1) && MEM_P (SET_SRC (x))));
--
-- case CONST_INT:
-- if (outer == SET)
-- {
-- if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
-- return COSTS_N_INSNS (1);
-- /* See split "TARGET_THUMB1 && satisfies_constraint_J". */
-- if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
-- return COSTS_N_INSNS (2);
-- /* See split "TARGET_THUMB1 && satisfies_constraint_K". */
-- if (thumb_shiftable_const (INTVAL (x)))
-- return COSTS_N_INSNS (2);
-- return COSTS_N_INSNS (3);
-- }
-- else if ((outer == PLUS || outer == COMPARE)
-- && INTVAL (x) < 256 && INTVAL (x) > -256)
-- return 0;
-- else if ((outer == IOR || outer == XOR || outer == AND)
-- && INTVAL (x) < 256 && INTVAL (x) >= -256)
-- return COSTS_N_INSNS (1);
-- else if (outer == AND)
-- {
-- int i;
-- /* This duplicates the tests in the andsi3 expander. */
-- for (i = 9; i <= 31; i++)
-- if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
-- || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
-- return COSTS_N_INSNS (2);
-- }
-- else if (outer == ASHIFT || outer == ASHIFTRT
-- || outer == LSHIFTRT)
-- return 0;
-- return COSTS_N_INSNS (2);
--
-- case CONST:
-- case CONST_DOUBLE:
-- case LABEL_REF:
-- case SYMBOL_REF:
-- return COSTS_N_INSNS (3);
--
-- case UDIV:
-- case UMOD:
-- case DIV:
-- case MOD:
-- return 100;
--
-- case TRUNCATE:
-- return 99;
--
-- case AND:
-- case XOR:
-- case IOR:
-- return COSTS_N_INSNS (1);
--
-- case MEM:
-- return (COSTS_N_INSNS (1)
-- + COSTS_N_INSNS (1)
-- * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
-- + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
-- ? COSTS_N_INSNS (1) : 0));
--
-- case IF_THEN_ELSE:
-- /* XXX a guess. */
-- if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
-- return 14;
-- return 2;
--
-- case ZERO_EXTEND:
-- /* XXX still guessing. */
-- switch (GET_MODE (XEXP (x, 0)))
-- {
-- case QImode:
-- return (1 + (mode == DImode ? 4 : 0)
-- + (MEM_P (XEXP (x, 0)) ? 10 : 0));
--
-- case HImode:
-- return (4 + (mode == DImode ? 4 : 0)
-- + (MEM_P (XEXP (x, 0)) ? 10 : 0));
--
-- case SImode:
-- return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0));
--
-- default:
-- return 99;
-- }
--
-- default:
-- return 99;
-- }
--}
--
--/* RTX costs when optimizing for size. */
--static bool
--arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-- int *total)
--{
-- machine_mode mode = GET_MODE (x);
-- if (TARGET_THUMB1)
-- {
-- *total = thumb1_size_rtx_costs (x, code, outer_code);
-- return true;
-- }
--
-- /* FIXME: This makes no attempt to prefer narrow Thumb-2 instructions. */
-- switch (code)
-- {
-- case MEM:
-- /* A memory access costs 1 insn if the mode is small, or the address is
-- a single register, otherwise it costs one insn per word. */
-- if (REG_P (XEXP (x, 0)))
-- *total = COSTS_N_INSNS (1);
-- else if (flag_pic
-- && GET_CODE (XEXP (x, 0)) == PLUS
-- && will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
-- /* This will be split into two instructions.
-- See arm.md:calculate_pic_address. */
-- *total = COSTS_N_INSNS (2);
-- else
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- return true;
--
-- case DIV:
-- case MOD:
-- case UDIV:
-- case UMOD:
-- /* Needs a libcall, so it costs about this. */
-- *total = COSTS_N_INSNS (2);
-- return false;
--
-- case ROTATE:
-- if (mode == SImode && REG_P (XEXP (x, 1)))
-- {
-- *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, false);
-- return true;
-- }
-- /* Fall through */
-- case ROTATERT:
-- case ASHIFT:
-- case LSHIFTRT:
-- case ASHIFTRT:
-- if (mode == DImode && CONST_INT_P (XEXP (x, 1)))
-- {
-- *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, false);
-- return true;
-- }
-- else if (mode == SImode)
-- {
-- *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), mode, code,
-- 0, false);
-- /* Slightly disparage register shifts, but not by much. */
-- if (!CONST_INT_P (XEXP (x, 1)))
-- *total += 1 + rtx_cost (XEXP (x, 1), mode, code, 1, false);
-- return true;
-- }
--
-- /* Needs a libcall. */
-- *total = COSTS_N_INSNS (2);
-- return false;
--
-- case MINUS:
-- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
-- && (mode == SFmode || !TARGET_VFP_SINGLE))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
--
-- if (mode == SImode)
-- {
-- enum rtx_code subcode0 = GET_CODE (XEXP (x, 0));
-- enum rtx_code subcode1 = GET_CODE (XEXP (x, 1));
--
-- if (subcode0 == ROTATE || subcode0 == ROTATERT || subcode0 == ASHIFT
-- || subcode0 == LSHIFTRT || subcode0 == ASHIFTRT
-- || subcode1 == ROTATE || subcode1 == ROTATERT
-- || subcode1 == ASHIFT || subcode1 == LSHIFTRT
-- || subcode1 == ASHIFTRT)
-- {
-- /* It's just the cost of the two operands. */
-- *total = 0;
-- return false;
-- }
--
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
--
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- return false;
--
-- case PLUS:
-- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
-- && (mode == SFmode || !TARGET_VFP_SINGLE))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
--
-- /* A shift as a part of ADD costs nothing. */
-- if (GET_CODE (XEXP (x, 0)) == MULT
-- && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
-- {
-- *total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1);
-- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, code, 0, false);
-- *total += rtx_cost (XEXP (x, 1), mode, code, 1, false);
-- return true;
-- }
--
-- /* Fall through */
-- case AND: case XOR: case IOR:
-- if (mode == SImode)
-- {
-- enum rtx_code subcode = GET_CODE (XEXP (x, 0));
--
-- if (subcode == ROTATE || subcode == ROTATERT || subcode == ASHIFT
-- || subcode == LSHIFTRT || subcode == ASHIFTRT
-- || (code == AND && subcode == NOT))
-- {
-- /* It's just the cost of the two operands. */
-- *total = 0;
-- return false;
-- }
-- }
--
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- return false;
--
-- case MULT:
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- return false;
--
-- case NEG:
-- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
-- && (mode == SFmode || !TARGET_VFP_SINGLE))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
--
-- /* Fall through */
-- case NOT:
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
--
-- return false;
-+ case TRUNCATE:
-+ return 99;
-
-- case IF_THEN_ELSE:
-- *total = 0;
-- return false;
-+ case AND:
-+ case XOR:
-+ case IOR:
-+ return COSTS_N_INSNS (1);
-
-- case COMPARE:
-- if (cc_register (XEXP (x, 0), VOIDmode))
--      *total = 0;
-- else
-- *total = COSTS_N_INSNS (1);
-- return false;
-+ case MEM:
-+ return (COSTS_N_INSNS (1)
-+ + COSTS_N_INSNS (1)
-+ * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
-+ + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
-+ ? COSTS_N_INSNS (1) : 0));
-
-- case ABS:
-- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
-- && (mode == SFmode || !TARGET_VFP_SINGLE))
-- *total = COSTS_N_INSNS (1);
-- else
-- *total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode));
-- return false;
-+ case IF_THEN_ELSE:
-+ /* XXX a guess. */
-+ if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
-+ return 14;
-+ return 2;
-
-- case SIGN_EXTEND:
- case ZERO_EXTEND:
-- return arm_rtx_costs_1 (x, outer_code, total, 0);
--
-- case CONST_INT:
-- if (const_ok_for_arm (INTVAL (x)))
-- /* A multiplication by a constant requires another instruction
-- to load the constant to a register. */
-- *total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT)
-- ? 1 : 0);
-- else if (const_ok_for_arm (~INTVAL (x)))
-- *total = COSTS_N_INSNS (outer_code == AND ? 0 : 1);
-- else if (const_ok_for_arm (-INTVAL (x)))
-- {
-- if (outer_code == COMPARE || outer_code == PLUS
-- || outer_code == MINUS)
-- *total = 0;
-- else
-- *total = COSTS_N_INSNS (1);
-- }
-- else
-- *total = COSTS_N_INSNS (2);
-- return true;
--
-- case CONST:
-- case LABEL_REF:
-- case SYMBOL_REF:
-- *total = COSTS_N_INSNS (2);
-- return true;
--
-- case CONST_DOUBLE:
-- *total = COSTS_N_INSNS (4);
-- return true;
-+ /* XXX still guessing. */
-+ switch (GET_MODE (XEXP (x, 0)))
-+ {
-+ case QImode:
-+ return (1 + (mode == DImode ? 4 : 0)
-+ + (MEM_P (XEXP (x, 0)) ? 10 : 0));
-
-- case CONST_VECTOR:
-- if (TARGET_NEON
-- && TARGET_HARD_FLOAT
-- && outer_code == SET
-- && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
-- && neon_immediate_valid_for_move (x, mode, NULL, NULL))
-- *total = COSTS_N_INSNS (1);
-- else
-- *total = COSTS_N_INSNS (4);
-- return true;
-+ case HImode:
-+ return (4 + (mode == DImode ? 4 : 0)
-+ + (MEM_P (XEXP (x, 0)) ? 10 : 0));
-
-- case HIGH:
-- case LO_SUM:
-- /* We prefer constant pool entries to MOVW/MOVT pairs, so bump the
-- cost of these slightly. */
-- *total = COSTS_N_INSNS (1) + 1;
-- return true;
-+ case SImode:
-+ return (1 + (MEM_P (XEXP (x, 0)) ? 10 : 0));
-
-- case SET:
-- return false;
-+ default:
-+ return 99;
-+ }
-
- default:
-- if (mode != VOIDmode)
-- *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- else
--      *total = COSTS_N_INSNS (4); /* Who knows? */
-- return false;
-+ return 99;
- }
- }
-
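For orientation while reading the deleted cost tables above and below: every cost is expressed in COSTS_N_INSNS units. A minimal reference sketch of the convention, matching GCC's rtl.h:

    /* rtl.h: cost of N fast instructions.  COSTS_N_INSNS (1) is the
       baseline; a value like COSTS_N_INSNS (20) above just means
       "roughly as expensive as a libcall".  */
    #define COSTS_N_INSNS(N) ((N) * 4)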
-@@ -9519,7 +8947,7 @@ arm_unspec_cost (rtx x, enum rtx_code /* outer_code */, bool speed_p, int *cost)
- flags are live or not, and thus no realistic way to determine what
- the size will eventually be. */
- static bool
--arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-+arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
- const struct cpu_cost_table *extra_cost,
- int *cost, bool speed_p)
- {
-@@ -10771,8 +10199,6 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
- if ((arm_arch4 || GET_MODE (XEXP (x, 0)) == SImode)
- && MEM_P (XEXP (x, 0)))
- {
-- *cost = rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed_p);
--
- if (mode == DImode)
- *cost += COSTS_N_INSNS (1);
-
-@@ -11164,390 +10590,70 @@ arm_new_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
- /* Vector costs? */
- }
- *cost = LIBCALL_COST (1);
-- return false;
--
-- case FLOAT:
-- case UNSIGNED_FLOAT:
-- if (TARGET_HARD_FLOAT)
-- {
-- /* ??? Increase the cost to deal with transferring from CORE
-- -> FP registers? */
-- if (speed_p)
-- *cost += extra_cost->fp[mode == DFmode].fromint;
-- return false;
-- }
-- *cost = LIBCALL_COST (1);
-- return false;
--
-- case CALL:
-- return true;
--
-- case ASM_OPERANDS:
-- {
-- /* Just a guess. Guess number of instructions in the asm
-- plus one insn per input. Always a minimum of COSTS_N_INSNS (1)
-- though (see PR60663). */
-- int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x)));
-- int num_operands = ASM_OPERANDS_INPUT_LENGTH (x);
--
-- *cost = COSTS_N_INSNS (asm_length + num_operands);
-- return true;
-- }
-- default:
-- if (mode != VOIDmode)
-- *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-- else
-- *cost = COSTS_N_INSNS (4); /* Who knows? */
-- return false;
-- }
--}
--
--#undef HANDLE_NARROW_SHIFT_ARITH
--
--/* RTX costs when optimizing for size. */
--static bool
--arm_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code,
-- int opno ATTRIBUTE_UNUSED, int *total, bool speed)
--{
-- bool result;
-- int code = GET_CODE (x);
--
-- if (TARGET_OLD_RTX_COSTS
-- || (!current_tune->insn_extra_cost && !TARGET_NEW_GENERIC_COSTS))
-- {
-- /* Old way. (Deprecated.) */
-- if (!speed)
-- result = arm_size_rtx_costs (x, (enum rtx_code) code,
-- (enum rtx_code) outer_code, total);
-- else
-- result = current_tune->rtx_costs (x, (enum rtx_code) code,
-- (enum rtx_code) outer_code, total,
-- speed);
-- }
-- else
-- {
-- /* New way. */
-- if (current_tune->insn_extra_cost)
-- result = arm_new_rtx_costs (x, (enum rtx_code) code,
-- (enum rtx_code) outer_code,
-- current_tune->insn_extra_cost,
-- total, speed);
-- /* TARGET_NEW_GENERIC_COSTS && !TARGET_OLD_RTX_COSTS
-- && current_tune->insn_extra_cost != NULL */
-- else
-- result = arm_new_rtx_costs (x, (enum rtx_code) code,
-- (enum rtx_code) outer_code,
-- &generic_extra_costs, total, speed);
-- }
--
-- if (dump_file && (dump_flags & TDF_DETAILS))
-- {
-- print_rtl_single (dump_file, x);
-- fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold",
-- *total, result ? "final" : "partial");
-- }
-- return result;
--}
--
--/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
-- supported on any "slowmul" cores, so it can be ignored. */
--
--static bool
--arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-- int *total, bool speed)
--{
-- machine_mode mode = GET_MODE (x);
--
-- if (TARGET_THUMB)
-- {
-- *total = thumb1_rtx_costs (x, code, outer_code);
-- return true;
-- }
--
-- switch (code)
-- {
-- case MULT:
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT
-- || mode == DImode)
-- {
-- *total = COSTS_N_INSNS (20);
-- return false;
-- }
--
-- if (CONST_INT_P (XEXP (x, 1)))
-- {
-- unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1))
-- & (unsigned HOST_WIDE_INT) 0xffffffff);
-- int cost, const_ok = const_ok_for_arm (i);
-- int j, booth_unit_size;
--
-- /* Tune as appropriate. */
-- cost = const_ok ? 4 : 8;
-- booth_unit_size = 2;
-- for (j = 0; i && j < 32; j += booth_unit_size)
-- {
-- i >>= booth_unit_size;
-- cost++;
-- }
--
-- *total = COSTS_N_INSNS (cost);
-- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
-- return true;
-- }
--
-- *total = COSTS_N_INSNS (20);
-- return false;
--
-- default:
--      return arm_rtx_costs_1 (x, outer_code, total, speed);
-- }
--}
--
--
--/* RTX cost for cores with a fast multiply unit (M variants). */
--
--static bool
--arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-- int *total, bool speed)
--{
-- machine_mode mode = GET_MODE (x);
--
-- if (TARGET_THUMB1)
-- {
-- *total = thumb1_rtx_costs (x, code, outer_code);
-- return true;
-- }
--
-- /* ??? should thumb2 use different costs? */
-- switch (code)
-- {
-- case MULT:
-- /* There is no point basing this on the tuning, since it is always the
-- fast variant if it exists at all. */
-- if (mode == DImode
-- && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
-- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
-- {
-- *total = COSTS_N_INSNS(2);
-- return false;
-- }
--
--
-- if (mode == DImode)
-- {
-- *total = COSTS_N_INSNS (5);
-- return false;
-- }
--
-- if (CONST_INT_P (XEXP (x, 1)))
-- {
-- unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1))
-- & (unsigned HOST_WIDE_INT) 0xffffffff);
-- int cost, const_ok = const_ok_for_arm (i);
-- int j, booth_unit_size;
--
-- /* Tune as appropriate. */
-- cost = const_ok ? 4 : 8;
-- booth_unit_size = 8;
-- for (j = 0; i && j < 32; j += booth_unit_size)
-- {
-- i >>= booth_unit_size;
-- cost++;
-- }
--
-- *total = COSTS_N_INSNS(cost);
-- return false;
-- }
--
-- if (mode == SImode)
-- {
-- *total = COSTS_N_INSNS (4);
-- return false;
-- }
--
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
-- {
-- if (TARGET_HARD_FLOAT
-- && (mode == SFmode
-- || (mode == DFmode && !TARGET_VFP_SINGLE)))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
-- }
--
-- /* Requires a lib call */
-- *total = COSTS_N_INSNS (20);
-- return false;
--
-- default:
-- return arm_rtx_costs_1 (x, outer_code, total, speed);
-- }
--}
--
--
--/* RTX cost for XScale CPUs. Thumb-2 is not supported on any xscale cores,
-- so it can be ignored. */
--
--static bool
--arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-- int *total, bool speed)
--{
-- machine_mode mode = GET_MODE (x);
--
-- if (TARGET_THUMB)
-- {
-- *total = thumb1_rtx_costs (x, code, outer_code);
-- return true;
-- }
--
-- switch (code)
-- {
-- case COMPARE:
-- if (GET_CODE (XEXP (x, 0)) != MULT)
-- return arm_rtx_costs_1 (x, outer_code, total, speed);
--
-- /* A COMPARE of a MULT is slow on XScale; the muls instruction
-- will stall until the multiplication is complete. */
-- *total = COSTS_N_INSNS (3);
-- return false;
--
-- case MULT:
-- /* There is no point basing this on the tuning, since it is always the
-- fast variant if it exists at all. */
-- if (mode == DImode
-- && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
-- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
-- {
-- *total = COSTS_N_INSNS (2);
-- return false;
-- }
--
--
-- if (mode == DImode)
-- {
-- *total = COSTS_N_INSNS (5);
-- return false;
-- }
--
-- if (CONST_INT_P (XEXP (x, 1)))
-- {
-- /* If operand 1 is a constant we can more accurately
-- calculate the cost of the multiply. The multiplier can
-- retire 15 bits on the first cycle and a further 12 on the
-- second. We do, of course, have to load the constant into
-- a register first. */
-- unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1));
-- /* There's a general overhead of one cycle. */
-- int cost = 1;
-- unsigned HOST_WIDE_INT masked_const;
--
-- if (i & 0x80000000)
-- i = ~i;
--
-- i &= (unsigned HOST_WIDE_INT) 0xffffffff;
--
-- masked_const = i & 0xffff8000;
-- if (masked_const != 0)
-- {
-- cost++;
-- masked_const = i & 0xf8000000;
-- if (masked_const != 0)
-- cost++;
-- }
-- *total = COSTS_N_INSNS (cost);
-- return false;
-- }
-+ return false;
-
-- if (mode == SImode)
-+ case FLOAT:
-+ case UNSIGNED_FLOAT:
-+ if (TARGET_HARD_FLOAT)
- {
-- *total = COSTS_N_INSNS (3);
-+ /* ??? Increase the cost to deal with transferring from CORE
-+ -> FP registers? */
-+ if (speed_p)
-+ *cost += extra_cost->fp[mode == DFmode].fromint;
- return false;
- }
--
-- /* Requires a lib call */
-- *total = COSTS_N_INSNS (20);
-+ *cost = LIBCALL_COST (1);
- return false;
-
-+ case CALL:
-+ return true;
-+
-+ case ASM_OPERANDS:
-+ {
-+ /* Just a guess. Guess number of instructions in the asm
-+ plus one insn per input. Always a minimum of COSTS_N_INSNS (1)
-+ though (see PR60663). */
-+ int asm_length = MAX (1, asm_str_count (ASM_OPERANDS_TEMPLATE (x)));
-+ int num_operands = ASM_OPERANDS_INPUT_LENGTH (x);
-+
-+ *cost = COSTS_N_INSNS (asm_length + num_operands);
-+ return true;
-+ }
- default:
-- return arm_rtx_costs_1 (x, outer_code, total, speed);
-+ if (mode != VOIDmode)
-+ *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
-+ else
-+ *cost = COSTS_N_INSNS (4); /* Who knows? */
-+ return false;
- }
- }
-
-+#undef HANDLE_NARROW_SHIFT_ARITH
-
--/* RTX costs for 9e (and later) cores. */
-+/* RTX costs entry point. */
-
- static bool
--arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-- int *total, bool speed)
-+arm_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code,
-+ int opno ATTRIBUTE_UNUSED, int *total, bool speed)
- {
-- machine_mode mode = GET_MODE (x);
--
-- if (TARGET_THUMB1)
-- {
-- switch (code)
-- {
-- case MULT:
-- /* Small multiply: 32 cycles for an integer multiply inst. */
-- if (arm_arch6m && arm_m_profile_small_mul)
-- *total = COSTS_N_INSNS (32);
-- else
-- *total = COSTS_N_INSNS (3);
-- return true;
-+ bool result;
-+ int code = GET_CODE (x);
-+ gcc_assert (current_tune->insn_extra_cost);
-
-- default:
-- *total = thumb1_rtx_costs (x, code, outer_code);
-- return true;
-- }
-- }
-+ result = arm_rtx_costs_internal (x, (enum rtx_code) code,
-+ (enum rtx_code) outer_code,
-+ current_tune->insn_extra_cost,
-+ total, speed);
-
-- switch (code)
-+ if (dump_file && (dump_flags & TDF_DETAILS))
- {
-- case MULT:
-- /* There is no point basing this on the tuning, since it is always the
-- fast variant if it exists at all. */
-- if (mode == DImode
-- && (GET_CODE (XEXP (x, 0)) == GET_CODE (XEXP (x, 1)))
-- && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-- || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
-- {
-- *total = COSTS_N_INSNS (2);
-- return false;
-- }
--
--
-- if (mode == DImode)
-- {
-- *total = COSTS_N_INSNS (5);
-- return false;
-- }
--
-- if (mode == SImode)
-- {
-- *total = COSTS_N_INSNS (2);
-- return false;
-- }
--
-- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
-- {
-- if (TARGET_HARD_FLOAT
-- && (mode == SFmode
-- || (mode == DFmode && !TARGET_VFP_SINGLE)))
-- {
-- *total = COSTS_N_INSNS (1);
-- return false;
-- }
-- }
--
-- *total = COSTS_N_INSNS (20);
-- return false;
--
-- default:
-- return arm_rtx_costs_1 (x, outer_code, total, speed);
-+ print_rtl_single (dump_file, x);
-+ fprintf (dump_file, "\n%s cost: %d (%s)\n", speed ? "Hot" : "Cold",
-+ *total, result ? "final" : "partial");
- }
-+ return result;
- }
-+
- /* All address computations that can be done are free, but rtx cost returns
- the same for practically all of them. So we weight the different types
- of address here in the order (most pref first):
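After this consolidation arm_rtx_costs is the sole cost entry point; it reaches the middle end through the usual target-hook table. A sketch of the pre-existing registration elsewhere in arm.c, shown here only for orientation:

    #undef  TARGET_RTX_COSTS
    #define TARGET_RTX_COSTS arm_rtx_costs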
-@@ -12269,7 +11375,7 @@ vfp3_const_double_index (rtx x)
-
- /* We can permit four significant bits of mantissa only, plus a high bit
- which is always 1. */
-- mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
-+ mask = (HOST_WIDE_INT_1U << (point_pos - 5)) - 1;
- if ((mantissa & mask) != 0)
- return -1;
-
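HOST_WIDE_INT_1U, which this and later hunks substitute for the open-coded casts, is hwint.h's spelling of an unsigned HOST_WIDE_INT constant one:

    /* hwint.h:  */
    #define HOST_WIDE_INT_1U HOST_WIDE_INT_UC (1)  /* i.e. (unsigned HOST_WIDE_INT) 1 */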
-@@ -12423,6 +11529,12 @@ neon_valid_immediate (rtx op, machine_mode mode, int inverse,
- return 18;
- }
-
-+ /* The tricks done in the code below apply for little-endian vector layout.
-+ For big-endian vectors only allow vectors of the form { a, a, a..., a }.
-+ FIXME: Implement logic for big-endian vectors. */
-+ if (BYTES_BIG_ENDIAN && vector && !const_vec_duplicate_p (op))
-+ return -1;
-+
- /* Splat vector constant out into a byte vector. */
- for (i = 0; i < n_elts; i++)
- {
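const_vec_duplicate_p matches only splat constants, so on big-endian the early-out above keeps { a, a, ..., a } and punts on everything else. A hypothetical illustration using GCC's vector extensions (function names made up):

    typedef int v4si __attribute__ ((vector_size (16)));

    v4si splat (void) { return (v4si) { 5, 5, 5, 5 }; }  /* still a candidate immediate */
    v4si ramp (void)  { return (v4si) { 1, 2, 3, 4 }; }  /* now returns -1 on big-endian */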
-@@ -13151,7 +12263,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb)
- {
- if (mode == HFmode)
- {
-- if (!TARGET_NEON_FP16)
-+ if (!TARGET_NEON_FP16 && !TARGET_VFP_FP16INST)
- return GENERAL_REGS;
- if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2, true))
- return NO_REGS;
-@@ -15988,14 +15100,17 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
- /* If the same input register is used in both stores
- when storing different constants, try to find a free register.
- For example, the code
-- mov r0, 0
-- str r0, [r2]
-- mov r0, 1
-- str r0, [r2, #4]
-+ mov r0, 0
-+ str r0, [r2]
-+ mov r0, 1
-+ str r0, [r2, #4]
- can be transformed into
-- mov r1, 0
-- strd r1, r0, [r2]
-- in Thumb mode assuming that r1 is free. */
-+ mov r1, 0
-+ mov r0, 1
-+ strd r1, r0, [r2]
-+ in Thumb mode assuming that r1 is free.
-+ For ARM mode do the same but only if the starting register
-+ can be made to be even. */
- if (const_store
- && REGNO (operands[0]) == REGNO (operands[1])
- && INTVAL (operands[4]) != INTVAL (operands[5]))
-@@ -16014,7 +15129,6 @@ gen_operands_ldrd_strd (rtx *operands, bool load,
- }
- else if (TARGET_ARM)
- {
-- return false;
- int regno = REGNO (operands[0]);
- if (!peep2_reg_dead_p (4, operands[0]))
- {
-@@ -16368,7 +15482,7 @@ get_jump_table_size (rtx_jump_table_data *insn)
- {
- case 1:
- /* Round up size of TBB table to a halfword boundary. */
-- size = (size + 1) & ~(HOST_WIDE_INT)1;
-+ size = (size + 1) & ~HOST_WIDE_INT_1;
- break;
- case 2:
- /* No padding necessary for TBH. */
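A quick check of the TBB rounding above: (5 + 1) & ~1 == 6 and (6 + 1) & ~1 == 6, so odd byte counts round up to the next halfword boundary while even ones pass through unchanged.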
-@@ -16837,35 +15951,37 @@ dump_minipool (rtx_insn *scan)
- fputc ('\n', dump_file);
- }
-
-+ rtx val = copy_rtx (mp->value);
-+
- switch (GET_MODE_SIZE (mp->mode))
- {
- #ifdef HAVE_consttable_1
- case 1:
-- scan = emit_insn_after (gen_consttable_1 (mp->value), scan);
-+ scan = emit_insn_after (gen_consttable_1 (val), scan);
- break;
-
- #endif
- #ifdef HAVE_consttable_2
- case 2:
-- scan = emit_insn_after (gen_consttable_2 (mp->value), scan);
-+ scan = emit_insn_after (gen_consttable_2 (val), scan);
- break;
-
- #endif
- #ifdef HAVE_consttable_4
- case 4:
-- scan = emit_insn_after (gen_consttable_4 (mp->value), scan);
-+ scan = emit_insn_after (gen_consttable_4 (val), scan);
- break;
-
- #endif
- #ifdef HAVE_consttable_8
- case 8:
-- scan = emit_insn_after (gen_consttable_8 (mp->value), scan);
-+ scan = emit_insn_after (gen_consttable_8 (val), scan);
- break;
-
- #endif
- #ifdef HAVE_consttable_16
- case 16:
-- scan = emit_insn_after (gen_consttable_16 (mp->value), scan);
-+ scan = emit_insn_after (gen_consttable_16 (val), scan);
- break;
-
- #endif
-@@ -17269,6 +16385,470 @@ note_invalid_constants (rtx_insn *insn, HOST_WIDE_INT address, int do_pushes)
- return;
- }
-
-+/* This function computes the clear mask and PADDING_BITS_TO_CLEAR for structs
-+ and unions in the context of ARMv8-M Security Extensions. It is used as a
-+ helper function for both 'cmse_nonsecure_call' and 'cmse_nonsecure_entry'
-+ functions. The PADDING_BITS_TO_CLEAR pointer can be the base to either one
-+ or four masks, depending on whether it is being computed for a
-+ 'cmse_nonsecure_entry' return value or a 'cmse_nonsecure_call' argument
-+ respectively. The tree for the type of the argument or a field within an
-+ argument is passed in ARG_TYPE, the current register this argument or field
-+ starts in is kept in the pointer REGNO and updated accordingly, the bit this
-+ argument or field starts at is passed in STARTING_BIT and the last used bit
-+ is kept in LAST_USED_BIT which is also updated accordingly. */
-+
-+static unsigned HOST_WIDE_INT
-+comp_not_to_clear_mask_str_un (tree arg_type, int * regno,
-+ uint32_t * padding_bits_to_clear,
-+ unsigned starting_bit, int * last_used_bit)
-+
-+{
-+ unsigned HOST_WIDE_INT not_to_clear_reg_mask = 0;
-+
-+ if (TREE_CODE (arg_type) == RECORD_TYPE)
-+ {
-+ unsigned current_bit = starting_bit;
-+ tree field;
-+ long int offset, size;
-+
-+
-+ field = TYPE_FIELDS (arg_type);
-+ while (field)
-+ {
-+ /* The offset within a structure is always an offset from
-+ the start of that structure. Make sure we take that into the
-+ calculation of the register based offset that we use here. */
-+ offset = starting_bit;
-+ offset += TREE_INT_CST_ELT (DECL_FIELD_BIT_OFFSET (field), 0);
-+ offset %= 32;
-+
-+ /* This is the actual size of the field, for bitfields this is the
-+ bitfield width and not the container size. */
-+ size = TREE_INT_CST_ELT (DECL_SIZE (field), 0);
-+
-+ if (*last_used_bit != offset)
-+ {
-+ if (offset < *last_used_bit)
-+ {
-+ /* This field's offset is before the 'last_used_bit', that
-+ means this field goes on the next register. So we need to
-+ pad the rest of the current register and increase the
-+ register number. */
-+ uint32_t mask;
-+ mask = ((uint32_t)-1) - ((uint32_t) 1 << *last_used_bit);
-+ mask++;
-+
-+ padding_bits_to_clear[*regno] |= mask;
-+ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
-+ (*regno)++;
-+ }
-+ else
-+ {
-+ /* Otherwise we pad the bits between the last field's end and
-+ the start of the new field. */
-+ uint32_t mask;
-+
-+ mask = ((uint32_t)-1) >> (32 - offset);
-+ mask -= ((uint32_t) 1 << *last_used_bit) - 1;
-+ padding_bits_to_clear[*regno] |= mask;
-+ }
-+ current_bit = offset;
-+ }
-+
-+ /* Calculate further padding bits for inner structs/unions too. */
-+ if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
-+ {
-+ *last_used_bit = current_bit;
-+ not_to_clear_reg_mask
-+ |= comp_not_to_clear_mask_str_un (TREE_TYPE (field), regno,
-+ padding_bits_to_clear, offset,
-+ last_used_bit);
-+ }
-+ else
-+ {
-+ /* Update 'current_bit' with this field's size. If the
-+ 'current_bit' lies in a subsequent register, update 'regno' and
-+ reset 'current_bit' to point to the current bit in that new
-+ register. */
-+ current_bit += size;
-+ while (current_bit >= 32)
-+ {
-+ current_bit-=32;
-+ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
-+ (*regno)++;
-+ }
-+ *last_used_bit = current_bit;
-+ }
-+
-+ field = TREE_CHAIN (field);
-+ }
-+ not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno;
-+ }
-+ else if (TREE_CODE (arg_type) == UNION_TYPE)
-+ {
-+ tree field, field_t;
-+ int i, regno_t, field_size;
-+ int max_reg = -1;
-+ int max_bit = -1;
-+ uint32_t mask;
-+ uint32_t padding_bits_to_clear_res[NUM_ARG_REGS]
-+ = {-1, -1, -1, -1};
-+
-+ /* To compute the padding bits in a union we only consider bits as
-+ padding bits if they are always either a padding bit or fall outside a
-+      field's size for all fields in the union. */
-+ field = TYPE_FIELDS (arg_type);
-+ while (field)
-+ {
-+ uint32_t padding_bits_to_clear_t[NUM_ARG_REGS]
-+ = {0U, 0U, 0U, 0U};
-+ int last_used_bit_t = *last_used_bit;
-+ regno_t = *regno;
-+ field_t = TREE_TYPE (field);
-+
-+ /* If the field's type is either a record or a union make sure to
-+ compute their padding bits too. */
-+ if (RECORD_OR_UNION_TYPE_P (field_t))
-+ not_to_clear_reg_mask
-+ |= comp_not_to_clear_mask_str_un (field_t, &regno_t,
-+ &padding_bits_to_clear_t[0],
-+ starting_bit, &last_used_bit_t);
-+ else
-+ {
-+ field_size = TREE_INT_CST_ELT (DECL_SIZE (field), 0);
-+ regno_t = (field_size / 32) + *regno;
-+ last_used_bit_t = (starting_bit + field_size) % 32;
-+ }
-+
-+ for (i = *regno; i < regno_t; i++)
-+ {
-+ /* For all but the last register used by this field only keep the
-+ padding bits that were padding bits in this field. */
-+ padding_bits_to_clear_res[i] &= padding_bits_to_clear_t[i];
-+ }
-+
-+ /* For the last register, keep all padding bits that were padding
-+ bits in this field and any padding bits that are still valid
-+ as padding bits but fall outside of this field's size. */
-+ mask = (((uint32_t) -1) - ((uint32_t) 1 << last_used_bit_t)) + 1;
-+ padding_bits_to_clear_res[regno_t]
-+ &= padding_bits_to_clear_t[regno_t] | mask;
-+
-+ /* Update the maximum size of the fields in terms of registers used
-+ ('max_reg') and the 'last_used_bit' in said register. */
-+ if (max_reg < regno_t)
-+ {
-+ max_reg = regno_t;
-+ max_bit = last_used_bit_t;
-+ }
-+ else if (max_reg == regno_t && max_bit < last_used_bit_t)
-+ max_bit = last_used_bit_t;
-+
-+ field = TREE_CHAIN (field);
-+ }
-+
-+ /* Update the current padding_bits_to_clear using the intersection of the
-+ padding bits of all the fields. */
-+ for (i=*regno; i < max_reg; i++)
-+ padding_bits_to_clear[i] |= padding_bits_to_clear_res[i];
-+
-+ /* Do not keep trailing padding bits, we do not know yet whether this
-+ is the end of the argument. */
-+ mask = ((uint32_t) 1 << max_bit) - 1;
-+ padding_bits_to_clear[max_reg]
-+ |= padding_bits_to_clear_res[max_reg] & mask;
-+
-+ *regno = max_reg;
-+ *last_used_bit = max_bit;
-+ }
-+ else
-+ /* This function should only be used for structs and unions. */
-+ gcc_unreachable ();
-+
-+ return not_to_clear_reg_mask;
-+}
-+
-+/* In the context of ARMv8-M Security Extensions, this function is used for both
-+ 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' functions to compute what
-+ registers are used when returning or passing arguments, which is then
-+ returned as a mask. It will also compute a mask to indicate padding/unused
-+ bits for each of these registers, and passes this through the
-+ PADDING_BITS_TO_CLEAR pointer. The tree of the argument type is passed in
-+ ARG_TYPE, the rtl representation of the argument is passed in ARG_RTX and
-+ the starting register used to pass this argument or return value is passed
-+ in REGNO. It makes use of 'comp_not_to_clear_mask_str_un' to compute these
-+ for struct and union types. */
-+
-+static unsigned HOST_WIDE_INT
-+compute_not_to_clear_mask (tree arg_type, rtx arg_rtx, int regno,
-+ uint32_t * padding_bits_to_clear)
-+
-+{
-+ int last_used_bit = 0;
-+ unsigned HOST_WIDE_INT not_to_clear_mask;
-+
-+ if (RECORD_OR_UNION_TYPE_P (arg_type))
-+ {
-+ not_to_clear_mask
-+ = comp_not_to_clear_mask_str_un (arg_type, &regno,
-+ padding_bits_to_clear, 0,
-+ &last_used_bit);
-+
-+
-+ /* If the 'last_used_bit' is not zero, that means we are still using a
-+ part of the last 'regno'. In such cases we must clear the trailing
-+	 bits.  Otherwise we are not using 'regno' at all and should mark it
-+	 as one to clear. */
-+ if (last_used_bit != 0)
-+ padding_bits_to_clear[regno]
-+ |= ((uint32_t)-1) - ((uint32_t) 1 << last_used_bit) + 1;
-+ else
-+ not_to_clear_mask &= ~(HOST_WIDE_INT_1U << regno);
-+ }
-+ else
-+ {
-+ not_to_clear_mask = 0;
-+      /* We are not dealing with structs or unions, so these arguments may be
-+ passed in floating point registers too. In some cases a BLKmode is
-+ used when returning or passing arguments in multiple VFP registers. */
-+ if (GET_MODE (arg_rtx) == BLKmode)
-+ {
-+ int i, arg_regs;
-+ rtx reg;
-+
-+ /* This should really only occur when dealing with the hard-float
-+ ABI. */
-+ gcc_assert (TARGET_HARD_FLOAT_ABI);
-+
-+ for (i = 0; i < XVECLEN (arg_rtx, 0); i++)
-+ {
-+ reg = XEXP (XVECEXP (arg_rtx, 0, i), 0);
-+ gcc_assert (REG_P (reg));
-+
-+ not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (reg);
-+
-+ /* If we are dealing with DF mode, make sure we don't
-+ clear either of the registers it addresses. */
-+ arg_regs = ARM_NUM_REGS (GET_MODE (reg));
-+ if (arg_regs > 1)
-+ {
-+ unsigned HOST_WIDE_INT mask;
-+ mask = HOST_WIDE_INT_1U << (REGNO (reg) + arg_regs);
-+ mask -= HOST_WIDE_INT_1U << REGNO (reg);
-+ not_to_clear_mask |= mask;
-+ }
-+ }
-+ }
-+ else
-+ {
-+ /* Otherwise we can rely on the MODE to determine how many registers
-+ are being used by this argument. */
-+ int arg_regs = ARM_NUM_REGS (GET_MODE (arg_rtx));
-+ not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (arg_rtx);
-+ if (arg_regs > 1)
-+ {
-+ unsigned HOST_WIDE_INT
-+ mask = HOST_WIDE_INT_1U << (REGNO (arg_rtx) + arg_regs);
-+ mask -= HOST_WIDE_INT_1U << REGNO (arg_rtx);
-+ not_to_clear_mask |= mask;
-+ }
-+ }
-+ }
-+
-+ return not_to_clear_mask;
-+}
-+
-+/* Before a cmse_nonsecure_call: save the callee-saved registers, clear the
-+   callee-saved registers and the caller-saved registers not used to pass
-+   arguments, and restore the callee-saved registers afterwards. */
-+
-+static void
-+cmse_nonsecure_call_clear_caller_saved (void)
-+{
-+ basic_block bb;
-+
-+ FOR_EACH_BB_FN (bb, cfun)
-+ {
-+ rtx_insn *insn;
-+
-+ FOR_BB_INSNS (bb, insn)
-+ {
-+ uint64_t to_clear_mask, float_mask;
-+ rtx_insn *seq;
-+ rtx pat, call, unspec, reg, cleared_reg, tmp;
-+ unsigned int regno, maxregno;
-+ rtx address;
-+ CUMULATIVE_ARGS args_so_far_v;
-+ cumulative_args_t args_so_far;
-+ tree arg_type, fntype;
-+ bool using_r4, first_param = true;
-+ function_args_iterator args_iter;
-+ uint32_t padding_bits_to_clear[4] = {0U, 0U, 0U, 0U};
-+ uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear[0];
-+
-+ if (!NONDEBUG_INSN_P (insn))
-+ continue;
-+
-+ if (!CALL_P (insn))
-+ continue;
-+
-+ pat = PATTERN (insn);
-+ gcc_assert (GET_CODE (pat) == PARALLEL && XVECLEN (pat, 0) > 0);
-+ call = XVECEXP (pat, 0, 0);
-+
-+	  /* Get the real call RTX if the insn sets a value, i.e. returns one. */
-+ if (GET_CODE (call) == SET)
-+ call = SET_SRC (call);
-+
-+ /* Check if it is a cmse_nonsecure_call. */
-+ unspec = XEXP (call, 0);
-+ if (GET_CODE (unspec) != UNSPEC
-+ || XINT (unspec, 1) != UNSPEC_NONSECURE_MEM)
-+ continue;
-+
-+ /* Determine the caller-saved registers we need to clear. */
-+ to_clear_mask = (1LL << (NUM_ARG_REGS)) - 1;
-+ maxregno = NUM_ARG_REGS - 1;
-+ /* Only look at the caller-saved floating point registers in case of
-+ -mfloat-abi=hard. For -mfloat-abi=softfp we will be using the
-+ lazy store and loads which clear both caller- and callee-saved
-+ registers. */
-+ if (TARGET_HARD_FLOAT_ABI)
-+ {
-+ float_mask = (1LL << (D7_VFP_REGNUM + 1)) - 1;
-+ float_mask &= ~((1LL << FIRST_VFP_REGNUM) - 1);
-+ to_clear_mask |= float_mask;
-+ maxregno = D7_VFP_REGNUM;
-+ }
-+
-+ /* Make sure the register used to hold the function address is not
-+ cleared. */
-+ address = RTVEC_ELT (XVEC (unspec, 0), 0);
-+ gcc_assert (MEM_P (address));
-+ gcc_assert (REG_P (XEXP (address, 0)));
-+ to_clear_mask &= ~(1LL << REGNO (XEXP (address, 0)));
-+
-+ /* Set basic block of call insn so that df rescan is performed on
-+ insns inserted here. */
-+ set_block_for_insn (insn, bb);
-+ df_set_flags (DF_DEFER_INSN_RESCAN);
-+ start_sequence ();
-+
-+ /* Make sure the scheduler doesn't schedule other insns beyond
-+ here. */
-+ emit_insn (gen_blockage ());
-+
-+	  /* Walk through all arguments and clear registers appropriately.  */
-+ fntype = TREE_TYPE (MEM_EXPR (address));
-+ arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX,
-+ NULL_TREE);
-+ args_so_far = pack_cumulative_args (&args_so_far_v);
-+ FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter)
-+ {
-+ rtx arg_rtx;
-+ machine_mode arg_mode = TYPE_MODE (arg_type);
-+
-+ if (VOID_TYPE_P (arg_type))
-+ continue;
-+
-+ if (!first_param)
-+ arm_function_arg_advance (args_so_far, arg_mode, arg_type,
-+ true);
-+
-+ arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type,
-+ true);
-+ gcc_assert (REG_P (arg_rtx));
-+ to_clear_mask
-+ &= ~compute_not_to_clear_mask (arg_type, arg_rtx,
-+ REGNO (arg_rtx),
-+ padding_bits_to_clear_ptr);
-+
-+ first_param = false;
-+ }
-+
-+ /* Clear padding bits where needed. */
-+ cleared_reg = XEXP (address, 0);
-+ reg = gen_rtx_REG (SImode, IP_REGNUM);
-+ using_r4 = false;
-+ for (regno = R0_REGNUM; regno < NUM_ARG_REGS; regno++)
-+ {
-+ if (padding_bits_to_clear[regno] == 0)
-+ continue;
-+
-+	      /* On a Thumb-1 target, copy the address of the function we are
-+		 calling from 'r4' into 'ip', so that 'r4' can be used to
-+		 clear the unused bits in the arguments. */
-+ if (TARGET_THUMB1 && !using_r4)
-+ {
-+ using_r4 = true;
-+ reg = cleared_reg;
-+ emit_move_insn (gen_rtx_REG (SImode, IP_REGNUM),
-+ reg);
-+ }
-+
-+ tmp = GEN_INT ((((~padding_bits_to_clear[regno]) << 16u) >> 16u));
-+ emit_move_insn (reg, tmp);
-+ /* Also fill the top half of the negated
-+ padding_bits_to_clear. */
-+ if (((~padding_bits_to_clear[regno]) >> 16) > 0)
-+ {
-+ tmp = GEN_INT ((~padding_bits_to_clear[regno]) >> 16);
-+ emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg,
-+ GEN_INT (16),
-+ GEN_INT (16)),
-+ tmp));
-+ }
-+
-+ emit_insn (gen_andsi3 (gen_rtx_REG (SImode, regno),
-+ gen_rtx_REG (SImode, regno),
-+ reg));
-+
-+ }
-+ if (using_r4)
-+ emit_move_insn (cleared_reg,
-+ gen_rtx_REG (SImode, IP_REGNUM));
-+
-+ /* We use right shift and left shift to clear the LSB of the address
-+ we jump to instead of using bic, to avoid having to use an extra
-+ register on Thumb-1. */
-+ tmp = gen_rtx_LSHIFTRT (SImode, cleared_reg, const1_rtx);
-+ emit_insn (gen_rtx_SET (cleared_reg, tmp));
-+ tmp = gen_rtx_ASHIFT (SImode, cleared_reg, const1_rtx);
-+ emit_insn (gen_rtx_SET (cleared_reg, tmp));
-+
-+	  /* Clear all registers that could leak information before doing a
-+	     non-secure call. */
-+ for (regno = R0_REGNUM; regno <= maxregno; regno++)
-+ {
-+ if (!(to_clear_mask & (1LL << regno)))
-+ continue;
-+
-+ /* If regno is an even vfp register and its successor is also to
-+ be cleared, use vmov. */
-+ if (IS_VFP_REGNUM (regno))
-+ {
-+ if (TARGET_VFP_DOUBLE
-+ && VFP_REGNO_OK_FOR_DOUBLE (regno)
-+ && to_clear_mask & (1LL << (regno + 1)))
-+ emit_move_insn (gen_rtx_REG (DFmode, regno++),
-+ CONST0_RTX (DFmode));
-+ else
-+ emit_move_insn (gen_rtx_REG (SFmode, regno),
-+ CONST0_RTX (SFmode));
-+ }
-+ else
-+ emit_move_insn (gen_rtx_REG (SImode, regno), cleared_reg);
-+ }
-+
-+ seq = get_insns ();
-+ end_sequence ();
-+ emit_insn_before (seq, insn);
-+
-+ }
-+ }
-+}
-+
- /* Rewrite move insn into subtract of 0 if the condition codes will
- be useful in next conditional jump insn. */
-
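For context, the walk above fires on indirect calls through a function type carrying the ACLE attribute; a minimal user-level sketch (hypothetical names, built with -mcmse):

    /* Hypothetical non-secure function-pointer type.  */
    typedef int __attribute__ ((cmse_nonsecure_call)) ns_fn_t (int);

    int call_out (ns_fn_t *fn, int x)
    {
      /* Caller-saved registers not carrying arguments are cleared
         around this call by the code above.  */
      return fn (x);
    }

The shift pair emitted near the end of the sequence implements a scratch-free "clear bit 0" of the target address: lsrs rN, rN, #1 followed by lsls rN, rN, #1.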
-@@ -17569,6 +17149,8 @@ arm_reorg (void)
- HOST_WIDE_INT address = 0;
- Mfix * fix;
-
-+ if (use_cmse)
-+ cmse_nonsecure_call_clear_caller_saved ();
- if (TARGET_THUMB1)
- thumb1_reorg ();
- else if (TARGET_THUMB2)
-@@ -17941,6 +17523,23 @@ vfp_emit_fstmd (int base_reg, int count)
- return count * 8;
- }
-
-+/* Returns true if -mcmse has been passed and the function pointed to by 'addr'
-+   has the cmse_nonsecure_call attribute; returns false otherwise. */
-+
-+bool
-+detect_cmse_nonsecure_call (tree addr)
-+{
-+ if (!addr)
-+ return FALSE;
-+
-+ tree fntype = TREE_TYPE (addr);
-+ if (use_cmse && lookup_attribute ("cmse_nonsecure_call",
-+ TYPE_ATTRIBUTES (fntype)))
-+ return TRUE;
-+ return FALSE;
-+}
-+
-+
- /* Emit a call instruction with pattern PAT. ADDR is the address of
- the call target. */
-
-@@ -18600,6 +18199,8 @@ output_move_vfp (rtx *operands)
- rtx reg, mem, addr, ops[2];
- int load = REG_P (operands[0]);
- int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;
-+ int sp = (!TARGET_VFP_FP16INST
-+ || GET_MODE_SIZE (GET_MODE (operands[0])) == 4);
- int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT;
- const char *templ;
- char buff[50];
-@@ -18612,8 +18213,10 @@ output_move_vfp (rtx *operands)
-
- gcc_assert (REG_P (reg));
- gcc_assert (IS_VFP_REGNUM (REGNO (reg)));
-- gcc_assert (mode == SFmode
-+ gcc_assert ((mode == HFmode && TARGET_HARD_FLOAT)
-+ || mode == SFmode
- || mode == DFmode
-+ || mode == HImode
- || mode == SImode
- || mode == DImode
- || (TARGET_NEON && VALID_NEON_DREG_MODE (mode)));
-@@ -18644,7 +18247,7 @@ output_move_vfp (rtx *operands)
-
- sprintf (buff, templ,
- load ? "ld" : "st",
-- dp ? "64" : "32",
-+ dp ? "64" : sp ? "32" : "16",
- dp ? "P" : "",
- integer_p ? "\t%@ int" : "");
- output_asm_insn (buff, ops);
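With the scalar FP16 moves available (TARGET_VFP_FP16INST, e.g. -march=armv8.2-a+fp16), the template above can now emit 16-bit transfers; illustrative output with made-up operands:

    vldr.16  s0, [r0]   @ HFmode/HImode load
    vstr.16  s0, [r1]   @ HFmode/HImode store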
-@@ -19070,7 +18673,8 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
- return NULL;
- }
-
-- *amountp = int_log2 (*amountp);
-+ *amountp = exact_log2 (*amountp);
-+ gcc_assert (IN_RANGE (*amountp, 0, 31));
- return ARM_LSL_NAME;
-
- default:
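exact_log2 (from hwint.h) returns the exponent for an exact power of two and -1 otherwise; e.g. exact_log2 (8) == 3 while exact_log2 (6) == -1, which the new IN_RANGE (*amountp, 0, 31) assert would catch.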
-@@ -19102,22 +18706,6 @@ shift_op (rtx op, HOST_WIDE_INT *amountp)
- return mnem;
- }
-
--/* Obtain the shift from the POWER of two. */
--
--static HOST_WIDE_INT
--int_log2 (HOST_WIDE_INT power)
--{
-- HOST_WIDE_INT shift = 0;
--
-- while ((((HOST_WIDE_INT) 1 << shift) & power) == 0)
-- {
-- gcc_assert (shift <= 31);
-- shift++;
-- }
--
-- return shift;
--}
--
- /* Output a .ascii pseudo-op, keeping track of lengths. This is
- because /bin/as is horribly restrictive. The judgement about
- whether or not each character is 'printable' (and can be output as
-@@ -19474,7 +19062,7 @@ arm_get_vfp_saved_size (void)
-
- saved = 0;
- /* Space for saved VFP registers. */
-- if (TARGET_HARD_FLOAT && TARGET_VFP)
-+ if (TARGET_HARD_FLOAT)
- {
- count = 0;
- for (regno = FIRST_VFP_REGNUM;
-@@ -19563,6 +19151,7 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
- (e.g. interworking) then we can load the return address
- directly into the PC. Otherwise we must load it into LR. */
- if (really_return
-+ && !IS_CMSE_ENTRY (func_type)
- && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK))
- return_reg = reg_names[PC_REGNUM];
- else
-@@ -19703,18 +19292,93 @@ output_return_instruction (rtx operand, bool really_return, bool reverse,
- break;
-
- default:
-+ if (IS_CMSE_ENTRY (func_type))
-+ {
-+ /* Check if we have to clear the 'GE bits' which is only used if
-+ parallel add and subtraction instructions are available. */
-+ if (TARGET_INT_SIMD)
-+ snprintf (instr, sizeof (instr),
-+ "msr%s\tAPSR_nzcvqg, %%|lr", conditional);
-+ else
-+ snprintf (instr, sizeof (instr),
-+ "msr%s\tAPSR_nzcvq, %%|lr", conditional);
-+
-+ output_asm_insn (instr, & operand);
-+ if (TARGET_HARD_FLOAT && !TARGET_THUMB1)
-+ {
-+ /* Clear the cumulative exception-status bits (0-4,7) and the
-+ condition code bits (28-31) of the FPSCR. We need to
-+ remember to clear the first scratch register used (IP) and
-+ save and restore the second (r4). */
-+ snprintf (instr, sizeof (instr), "push\t{%%|r4}");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "vmrs\t%%|ip, fpscr");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "movw\t%%|r4, #65376");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "movt\t%%|r4, #4095");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "and\t%%|ip, %%|r4");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "vmsr\tfpscr, %%|ip");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "pop\t{%%|r4}");
-+ output_asm_insn (instr, & operand);
-+ snprintf (instr, sizeof (instr), "mov\t%%|ip, %%|lr");
-+ output_asm_insn (instr, & operand);
-+ }
-+ snprintf (instr, sizeof (instr), "bxns\t%%|lr");
-+ }
- /* Use bx if it's available. */
-- if (arm_arch5 || arm_arch4t)
-+ else if (arm_arch5 || arm_arch4t)
- sprintf (instr, "bx%s\t%%|lr", conditional);
- else
- sprintf (instr, "mov%s\t%%|pc, %%|lr", conditional);
- break;
- }
-
-- output_asm_insn (instr, & operand);
-+ output_asm_insn (instr, & operand);
-+ }
-+
-+ return "";
-+}
-+
-+/* Output in FILE asm statements needed to declare the NAME of the function
-+ defined by its DECL node. */
-+
-+void
-+arm_asm_declare_function_name (FILE *file, const char *name, tree decl)
-+{
-+ size_t cmse_name_len;
-+ char *cmse_name = 0;
-+ char cmse_prefix[] = "__acle_se_";
-+
-+ /* When compiling with ARMv8-M Security Extensions enabled, we should print an
-+ extra function label for each function with the 'cmse_nonsecure_entry'
-+ attribute. This extra function label should be prepended with
-+ '__acle_se_', telling the linker that it needs to create secure gateway
-+ veneers for this function. */
-+ if (use_cmse && lookup_attribute ("cmse_nonsecure_entry",
-+ DECL_ATTRIBUTES (decl)))
-+ {
-+ cmse_name_len = sizeof (cmse_prefix) + strlen (name);
-+ cmse_name = XALLOCAVEC (char, cmse_name_len);
-+ snprintf (cmse_name, cmse_name_len, "%s%s", cmse_prefix, name);
-+ targetm.asm_out.globalize_label (file, cmse_name);
-+
-+ ARM_DECLARE_FUNCTION_NAME (file, cmse_name, decl);
-+ ASM_OUTPUT_TYPE_DIRECTIVE (file, cmse_name, "function");
- }
-
-- return "";
-+ ARM_DECLARE_FUNCTION_NAME (file, name, decl);
-+ ASM_OUTPUT_TYPE_DIRECTIVE (file, name, "function");
-+ ASM_DECLARE_RESULT (file, DECL_RESULT (decl));
-+ ASM_OUTPUT_LABEL (file, name);
-+
-+ if (cmse_name)
-+ ASM_OUTPUT_LABEL (file, cmse_name);
-+
-+ ARM_OUTPUT_FN_UNWIND (file, TRUE);
- }
-
- /* Write the function name into the code section, directly preceding
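Two notes on the CMSE entry epilogue above. First, the movw/movt pair builds the FPSCR mask 0x0FFFFF60 (65376 == 0xFF60 for the low half, 4095 == 0x0FFF for the high half), keeping every bit except the cumulative exception flags (bits 0-4 and 7) and the condition codes (bits 28-31). Second, the extra '__acle_se_' label is driven by the function attribute; a user-level sketch with a hypothetical name, built with -mcmse:

    int __attribute__ ((cmse_nonsecure_entry))
    ns_entry (int x)
    {
      return x + 1;  /* returns via bxns, with non-result registers cleared */
    }

The assembler then sees both ns_entry and __acle_se_ns_entry; the linker uses the prefixed label to build the secure gateway veneer.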
-@@ -19766,10 +19430,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
- {
- unsigned long func_type;
-
-- /* ??? Do we want to print some of the below anyway? */
-- if (TARGET_THUMB1)
-- return;
--
- /* Sanity check. */
- gcc_assert (!arm_ccfsm_state && !arm_target_insn);
-
-@@ -19804,6 +19464,8 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
- asm_fprintf (f, "\t%@ Nested: function declared inside another function.\n");
- if (IS_STACKALIGN (func_type))
- asm_fprintf (f, "\t%@ Stack Align: May be called with mis-aligned SP.\n");
-+ if (IS_CMSE_ENTRY (func_type))
-+ asm_fprintf (f, "\t%@ Non-secure entry function: called from non-secure code.\n");
-
- asm_fprintf (f, "\t%@ args = %d, pretend = %d, frame = %wd\n",
- crtl->args.size,
-@@ -20473,7 +20135,7 @@ arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg)
- REG_NOTES (par) = dwarf;
-
-   /* Make sure cfa doesn't leave with IP_REGNUM to allow unwinding from FP. */
-- if (TARGET_VFP && REGNO (base_reg) == IP_REGNUM)
-+ if (REGNO (base_reg) == IP_REGNUM)
- {
- RTX_FRAME_RELATED_P (par) = 1;
- add_reg_note (par, REG_CFA_DEF_CFA, hard_frame_pointer_rtx);
-@@ -20934,7 +20596,7 @@ arm_get_frame_offsets (void)
- func_type = arm_current_func_type ();
- /* Space for saved VFP registers. */
- if (! IS_VOLATILE (func_type)
-- && TARGET_HARD_FLOAT && TARGET_VFP)
-+ && TARGET_HARD_FLOAT)
- saved += arm_get_vfp_saved_size ();
- }
- else /* TARGET_THUMB1 */
-@@ -21155,7 +20817,7 @@ arm_save_coproc_regs(void)
- saved_size += 8;
- }
-
-- if (TARGET_HARD_FLOAT && TARGET_VFP)
-+ if (TARGET_HARD_FLOAT)
- {
- start_reg = FIRST_VFP_REGNUM;
-
-@@ -22941,6 +22603,8 @@ maybe_get_arm_condition_code (rtx comparison)
- {
- case LTU: return ARM_CS;
- case GEU: return ARM_CC;
-+ case NE: return ARM_CS;
-+ case EQ: return ARM_CC;
- default: return ARM_NV;
- }
-
-@@ -22966,6 +22630,14 @@ maybe_get_arm_condition_code (rtx comparison)
- default: return ARM_NV;
- }
-
-+ case CC_Vmode:
-+ switch (comp_code)
-+ {
-+ case NE: return ARM_VS;
-+ case EQ: return ARM_VC;
-+ default: return ARM_NV;
-+ }
-+
- case CCmode:
- switch (comp_code)
- {
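The new CC_Vmode mappings (ARM_VS/ARM_VC) are presumably there for overflow-checking patterns; a plausible user-level trigger, offered as a sketch rather than a guaranteed code path:

    #include <stdbool.h>

    bool add_overflows (int a, int b, int *out)
    {
      /* On ARM this can lower to adds plus a V-flag conditional.  */
      return __builtin_add_overflow (a, b, out);
    }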
-@@ -23396,7 +23068,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
- {
- if (GET_MODE_CLASS (mode) == MODE_CC)
- return (regno == CC_REGNUM
-- || (TARGET_HARD_FLOAT && TARGET_VFP
-+ || (TARGET_HARD_FLOAT
- && regno == VFPCC_REGNUM));
-
- if (regno == CC_REGNUM && GET_MODE_CLASS (mode) != MODE_CC)
-@@ -23410,8 +23082,7 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
- start of an even numbered register pair. */
- return (ARM_NUM_REGS (mode) < 2) || (regno < LAST_LO_REGNUM);
-
-- if (TARGET_HARD_FLOAT && TARGET_VFP
-- && IS_VFP_REGNUM (regno))
-+ if (TARGET_HARD_FLOAT && IS_VFP_REGNUM (regno))
- {
- if (mode == SFmode || mode == SImode)
- return VFP_REGNO_OK_FOR_SINGLE (regno);
-@@ -23419,10 +23090,12 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
- if (mode == DFmode)
- return VFP_REGNO_OK_FOR_DOUBLE (regno);
-
-- /* VFP registers can hold HFmode values, but there is no point in
-- putting them there unless we have hardware conversion insns. */
- if (mode == HFmode)
-- return TARGET_FP16 && VFP_REGNO_OK_FOR_SINGLE (regno);
-+ return VFP_REGNO_OK_FOR_SINGLE (regno);
-+
-+ /* VFP registers can hold HImode values. */
-+ if (mode == HImode)
-+ return VFP_REGNO_OK_FOR_SINGLE (regno);
-
- if (TARGET_NEON)
- return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno))
-@@ -23626,26 +23299,6 @@ arm_debugger_arg_offset (int value, rtx addr)
- return value;
- }
-
--/* Implement TARGET_INVALID_PARAMETER_TYPE. */
--
--static const char *
--arm_invalid_parameter_type (const_tree t)
--{
-- if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
-- return N_("function parameters cannot have __fp16 type");
-- return NULL;
--}
--
--/* Implement TARGET_INVALID_PARAMETER_TYPE. */
--
--static const char *
--arm_invalid_return_type (const_tree t)
--{
-- if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
-- return N_("functions cannot return __fp16 type");
-- return NULL;
--}
--
- /* Implement TARGET_PROMOTED_TYPE. */
-
- static tree
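With arm_invalid_parameter_type and arm_invalid_return_type removed, __fp16 becomes acceptable in parameter and return positions on FP16-capable targets; a small sketch:

    /* Previously rejected ("function parameters cannot have __fp16
       type"); accepted once the hooks above are gone.  */
    __fp16 half_sum (__fp16 a, __fp16 b)
    {
      return a + b;
    }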
-@@ -23885,8 +23538,8 @@ thumb_pop (FILE *f, unsigned long mask)
- if (mask & (1 << PC_REGNUM))
- {
- /* Catch popping the PC. */
-- if (TARGET_INTERWORK || TARGET_BACKTRACE
-- || crtl->calls_eh_return)
-+ if (TARGET_INTERWORK || TARGET_BACKTRACE || crtl->calls_eh_return
-+ || IS_CMSE_ENTRY (arm_current_func_type ()))
- {
-	  /* The PC is never popped directly; instead
- it is popped into r3 and then BX is used. */
-@@ -23947,7 +23600,14 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
- if (crtl->calls_eh_return)
- asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM);
-
-- asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
-+ if (IS_CMSE_ENTRY (arm_current_func_type ()))
-+ {
-+ asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n",
-+ reg_containing_return_addr);
-+ asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr);
-+ }
-+ else
-+ asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
- return;
- }
- /* Otherwise if we are not supporting interworking and we have not created
-@@ -23956,7 +23616,8 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
- else if (!TARGET_INTERWORK
- && !TARGET_BACKTRACE
- && !is_called_in_ARM_mode (current_function_decl)
-- && !crtl->calls_eh_return)
-+ && !crtl->calls_eh_return
-+ && !IS_CMSE_ENTRY (arm_current_func_type ()))
- {
- asm_fprintf (f, "\tpop\t{%r}\n", PC_REGNUM);
- return;
-@@ -24179,7 +23840,21 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
- asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM);
-
- /* Return to caller. */
-- asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
-+ if (IS_CMSE_ENTRY (arm_current_func_type ()))
-+ {
-+ /* This is for the cases where LR is not being used to contain the return
-+ address. It may therefore contain information that we might not want
-+ to leak, hence it must be cleared. The value in R0 will never be a
-+ secret at this point, so it is safe to use it, see the clearing code
-+ in 'cmse_nonsecure_entry_clear_before_return'. */
-+ if (reg_containing_return_addr != LR_REGNUM)
-+ asm_fprintf (f, "\tmov\tlr, r0\n");
-+
-+ asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n", reg_containing_return_addr);
-+ asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr);
-+ }
-+ else
-+ asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
- }
-
- /* Scan INSN just before assembler is output for it.
-@@ -25044,6 +24719,149 @@ thumb1_expand_prologue (void)
- cfun->machine->lr_save_eliminated = 0;
- }
-
-+/* Clear caller saved registers not used to pass return values and leaked
-+ condition flags before exiting a cmse_nonsecure_entry function. */
-+
-+void
-+cmse_nonsecure_entry_clear_before_return (void)
-+{
-+ uint64_t to_clear_mask[2];
-+ uint32_t padding_bits_to_clear = 0;
-+ uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear;
-+ int regno, maxregno = IP_REGNUM;
-+ tree result_type;
-+ rtx result_rtl;
-+
-+ to_clear_mask[0] = (1ULL << (NUM_ARG_REGS)) - 1;
-+ to_clear_mask[0] |= (1ULL << IP_REGNUM);
-+
-+ /* If we are not dealing with -mfloat-abi=soft we will need to clear VFP
-+ registers. We also check that TARGET_HARD_FLOAT and !TARGET_THUMB1 hold
-+ to make sure the instructions used to clear them are present. */
-+ if (TARGET_HARD_FLOAT && !TARGET_THUMB1)
-+ {
-+ uint64_t float_mask = (1ULL << (D7_VFP_REGNUM + 1)) - 1;
-+ maxregno = LAST_VFP_REGNUM;
-+
-+ float_mask &= ~((1ULL << FIRST_VFP_REGNUM) - 1);
-+ to_clear_mask[0] |= float_mask;
-+
-+ float_mask = (1ULL << (maxregno - 63)) - 1;
-+ to_clear_mask[1] = float_mask;
-+
-+ /* Make sure we don't clear the two scratch registers used to clear the
-+ relevant FPSCR bits in output_return_instruction. */
-+ emit_use (gen_rtx_REG (SImode, IP_REGNUM));
-+ to_clear_mask[0] &= ~(1ULL << IP_REGNUM);
-+ emit_use (gen_rtx_REG (SImode, 4));
-+ to_clear_mask[0] &= ~(1ULL << 4);
-+ }
-+
-+  /* If the user has defined registers to be caller-saved, these are no longer
-+     restored by the function before returning and must thus be cleared for
-+     security purposes.  */
-+ for (regno = NUM_ARG_REGS; regno < LAST_VFP_REGNUM; regno++)
-+ {
-+ /* We do not touch registers that can be used to pass arguments as per
-+ the AAPCS, since these should never be made callee-saved by user
-+ options. */
-+ if (IN_RANGE (regno, FIRST_VFP_REGNUM, D7_VFP_REGNUM))
-+ continue;
-+ if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
-+ continue;
-+ if (call_used_regs[regno])
-+ to_clear_mask[regno / 64] |= (1ULL << (regno % 64));
-+ }
-+
-+  /* Make sure we do not clear the registers used to return the result.  */
-+ result_type = TREE_TYPE (DECL_RESULT (current_function_decl));
-+ if (!VOID_TYPE_P (result_type))
-+ {
-+ result_rtl = arm_function_value (result_type, current_function_decl, 0);
-+
-+ /* No need to check that we return in registers, because we don't
-+ support returning on stack yet. */
-+ to_clear_mask[0]
-+ &= ~compute_not_to_clear_mask (result_type, result_rtl, 0,
-+ padding_bits_to_clear_ptr);
-+ }
-+
-+ if (padding_bits_to_clear != 0)
-+ {
-+ rtx reg_rtx;
-+      /* Padding bits to clear is not 0, so we know we are dealing with
-+	 returning a composite type, which only uses r0.  Make sure that
-+	 r1-r3 are cleared too; we will use r1 as a scratch register.  */
-+ gcc_assert ((to_clear_mask[0] & 0xe) == 0xe);
-+
-+ reg_rtx = gen_rtx_REG (SImode, R1_REGNUM);
-+
-+ /* Fill the lower half of the negated padding_bits_to_clear. */
-+ emit_move_insn (reg_rtx,
-+ GEN_INT ((((~padding_bits_to_clear) << 16u) >> 16u)));
-+
-+ /* Also fill the top half of the negated padding_bits_to_clear. */
-+ if (((~padding_bits_to_clear) >> 16) > 0)
-+ emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg_rtx,
-+ GEN_INT (16),
-+ GEN_INT (16)),
-+ GEN_INT ((~padding_bits_to_clear) >> 16)));
-+
-+ emit_insn (gen_andsi3 (gen_rtx_REG (SImode, R0_REGNUM),
-+ gen_rtx_REG (SImode, R0_REGNUM),
-+ reg_rtx));
-+ }
-+
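-+  /* Scrub every register still set in the mask.  Core registers are cleared
-+     by copying from a register known not to hold secret data at this point
-+     (r0 for Thumb-1, LR otherwise); VFP registers are cleared with immediate
-+     moves, see below.  */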
-+ for (regno = R0_REGNUM; regno <= maxregno; regno++)
-+ {
-+ if (!(to_clear_mask[regno / 64] & (1ULL << (regno % 64))))
-+ continue;
-+
-+ if (IS_VFP_REGNUM (regno))
-+ {
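-+	  /* VFP registers are scrubbed by loading the constant 1.0, since the
-+	     VFP "vmov immediate" encoding can represent 1.0 but has no
-+	     encoding for 0.0.  */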
-+ /* If regno is an even vfp register and its successor is also to
-+ be cleared, use vmov. */
-+ if (TARGET_VFP_DOUBLE
-+ && VFP_REGNO_OK_FOR_DOUBLE (regno)
-+ && to_clear_mask[regno / 64] & (1ULL << ((regno % 64) + 1)))
-+ {
-+ emit_move_insn (gen_rtx_REG (DFmode, regno),
-+ CONST1_RTX (DFmode));
-+ emit_use (gen_rtx_REG (DFmode, regno));
-+ regno++;
-+ }
-+ else
-+ {
-+ emit_move_insn (gen_rtx_REG (SFmode, regno),
-+ CONST1_RTX (SFmode));
-+ emit_use (gen_rtx_REG (SFmode, regno));
-+ }
-+ }
-+ else
-+ {
-+ if (TARGET_THUMB1)
-+ {
-+ if (regno == R0_REGNUM)
-+ emit_move_insn (gen_rtx_REG (SImode, regno),
-+ const0_rtx);
-+ else
-+		/* R0 has either been cleared before (see the code above) or it
-+		   holds a return value; either way it does not hold secret
-+		   information.  */
-+ emit_move_insn (gen_rtx_REG (SImode, regno),
-+ gen_rtx_REG (SImode, R0_REGNUM));
-+ emit_use (gen_rtx_REG (SImode, regno));
-+ }
-+ else
-+ {
-+ emit_move_insn (gen_rtx_REG (SImode, regno),
-+ gen_rtx_REG (SImode, LR_REGNUM));
-+ emit_use (gen_rtx_REG (SImode, regno));
-+ }
-+ }
-+ }
-+}
-+
- /* Generate pattern *pop_multiple_with_stack_update_and_return if single
- POP instruction can be generated. LR should be replaced by PC. All
- the checks required are already done by USE_RETURN_INSN (). Hence,
-@@ -25065,6 +24883,12 @@ thumb2_expand_return (bool simple_return)
-
- if (!simple_return && saved_regs_mask)
- {
-+      /* TODO: Verify that this path is never taken for cmse_nonsecure_entry
-+	 functions, or adapt the code to handle it according to the ACLE.  This
-+	 path should not be reachable for cmse_nonsecure_entry functions, but we
-+	 prefer to assert it for now to ensure that future code changes do not
-+	 silently change this behavior.  */
-+ gcc_assert (!IS_CMSE_ENTRY (arm_current_func_type ()));
- if (num_regs == 1)
- {
- rtx par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
-@@ -25087,6 +24911,8 @@ thumb2_expand_return (bool simple_return)
- }
- else
- {
-+ if (IS_CMSE_ENTRY (arm_current_func_type ()))
-+ cmse_nonsecure_entry_clear_before_return ();
- emit_jump_insn (simple_return_rtx);
- }
- }
-@@ -25145,6 +24971,10 @@ thumb1_expand_epilogue (void)
-
- if (! df_regs_ever_live_p (LR_REGNUM))
- emit_use (gen_rtx_REG (SImode, LR_REGNUM));
-+
-+ /* Clear all caller-saved regs that are not used to return. */
-+ if (IS_CMSE_ENTRY (arm_current_func_type ()))
-+ cmse_nonsecure_entry_clear_before_return ();
- }
-
- /* Epilogue code for APCS frame. */
-@@ -25179,7 +25009,7 @@ arm_expand_epilogue_apcs_frame (bool really_return)
- floats_from_frame += 4;
- }
-
-- if (TARGET_HARD_FLOAT && TARGET_VFP)
-+ if (TARGET_HARD_FLOAT)
- {
- int start_reg;
- rtx ip_rtx = gen_rtx_REG (SImode, IP_REGNUM);
-@@ -25425,7 +25255,7 @@ arm_expand_epilogue (bool really_return)
- }
- }
-
-- if (TARGET_HARD_FLOAT && TARGET_VFP)
-+ if (TARGET_HARD_FLOAT)
- {
- /* Generate VFP register multi-pop. */
- int end_reg = LAST_VFP_REGNUM + 1;
-@@ -25482,6 +25312,7 @@ arm_expand_epilogue (bool really_return)
-
- if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED
- && (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL)
-+ && !IS_CMSE_ENTRY (func_type)
- && !IS_STACKALIGN (func_type)
- && really_return
- && crtl->args.pretend_args_size == 0
-@@ -25578,6 +25409,14 @@ arm_expand_epilogue (bool really_return)
- stack_pointer_rtx, stack_pointer_rtx);
- }
-
-+ /* Clear all caller-saved regs that are not used to return. */
-+ if (IS_CMSE_ENTRY (arm_current_func_type ()))
-+ {
-+ /* CMSE_ENTRY always returns. */
-+ gcc_assert (really_return);
-+ cmse_nonsecure_entry_clear_before_return ();
-+ }
-+
- if (!really_return)
- return;
-
-@@ -25874,13 +25713,6 @@ thumb_reload_out_hi (rtx *operands)
- emit_insn (gen_thumb_movhi_clobber (operands[0], operands[1], operands[2]));
- }
-
--/* Handle reading a half-word from memory during reload. */
--void
--thumb_reload_in_hi (rtx *operands ATTRIBUTE_UNUSED)
--{
-- gcc_unreachable ();
--}
--
- /* Return the length of a function name prefix
- that starts with the character 'c'. */
- static int
-@@ -25950,46 +25782,55 @@ arm_emit_eabi_attribute (const char *name, int num, int val)
- void
- arm_print_tune_info (void)
- {
-- asm_fprintf (asm_out_file, "\t@.tune parameters\n");
-- asm_fprintf (asm_out_file, "\t\t@constant_limit:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune parameters\n");
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "constant_limit:\t%d\n",
- current_tune->constant_limit);
-- asm_fprintf (asm_out_file, "\t\t@max_insns_skipped:\t%d\n",
-- current_tune->max_insns_skipped);
-- asm_fprintf (asm_out_file, "\t\t@prefetch.num_slots:\t%d\n",
-- current_tune->prefetch.num_slots);
-- asm_fprintf (asm_out_file, "\t\t@prefetch.l1_cache_size:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "max_insns_skipped:\t%d\n", current_tune->max_insns_skipped);
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "prefetch.num_slots:\t%d\n", current_tune->prefetch.num_slots);
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "prefetch.l1_cache_size:\t%d\n",
- current_tune->prefetch.l1_cache_size);
-- asm_fprintf (asm_out_file, "\t\t@prefetch.l1_cache_line_size:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "prefetch.l1_cache_line_size:\t%d\n",
- current_tune->prefetch.l1_cache_line_size);
-- asm_fprintf (asm_out_file, "\t\t@prefer_constant_pool:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "prefer_constant_pool:\t%d\n",
- (int) current_tune->prefer_constant_pool);
-- asm_fprintf (asm_out_file, "\t\t@branch_cost:\t(s:speed, p:predictable)\n");
-- asm_fprintf (asm_out_file, "\t\t\t\ts&p\tcost\n");
-- asm_fprintf (asm_out_file, "\t\t\t\t00\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "branch_cost:\t(s:speed, p:predictable)\n");
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\ts&p\tcost\n");
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t00\t%d\n",
- current_tune->branch_cost (false, false));
-- asm_fprintf (asm_out_file, "\t\t\t\t01\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t01\t%d\n",
- current_tune->branch_cost (false, true));
-- asm_fprintf (asm_out_file, "\t\t\t\t10\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t10\t%d\n",
- current_tune->branch_cost (true, false));
-- asm_fprintf (asm_out_file, "\t\t\t\t11\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "\t\t11\t%d\n",
- current_tune->branch_cost (true, true));
-- asm_fprintf (asm_out_file, "\t\t@prefer_ldrd_strd:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "prefer_ldrd_strd:\t%d\n",
- (int) current_tune->prefer_ldrd_strd);
-- asm_fprintf (asm_out_file, "\t\t@logical_op_non_short_circuit:\t[%d,%d]\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "logical_op_non_short_circuit:\t[%d,%d]\n",
- (int) current_tune->logical_op_non_short_circuit_thumb,
- (int) current_tune->logical_op_non_short_circuit_arm);
-- asm_fprintf (asm_out_file, "\t\t@prefer_neon_for_64bits:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "prefer_neon_for_64bits:\t%d\n",
- (int) current_tune->prefer_neon_for_64bits);
-- asm_fprintf (asm_out_file,
-- "\t\t@disparage_flag_setting_t16_encodings:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "disparage_flag_setting_t16_encodings:\t%d\n",
- (int) current_tune->disparage_flag_setting_t16_encodings);
-- asm_fprintf (asm_out_file, "\t\t@string_ops_prefer_neon:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "string_ops_prefer_neon:\t%d\n",
- (int) current_tune->string_ops_prefer_neon);
-- asm_fprintf (asm_out_file, "\t\t@max_insns_inline_memset:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START
-+ "max_insns_inline_memset:\t%d\n",
- current_tune->max_insns_inline_memset);
-- asm_fprintf (asm_out_file, "\t\t@fusible_ops:\t%u\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "fusible_ops:\t%u\n",
- current_tune->fusible_ops);
-- asm_fprintf (asm_out_file, "\t\t@sched_autopref:\t%d\n",
-+ asm_fprintf (asm_out_file, "\t\t" ASM_COMMENT_START "sched_autopref:\t%d\n",
- (int) current_tune->sched_autopref);
- }
-
-@@ -26018,7 +25859,7 @@ arm_file_start (void)
- const char* pos = strchr (arm_selected_arch->name, '+');
- if (pos)
- {
-- char buf[15];
-+ char buf[32];
- gcc_assert (strlen (arm_selected_arch->name)
- <= sizeof (buf) / sizeof (*pos));
- strncpy (buf, arm_selected_arch->name,
-@@ -26043,7 +25884,7 @@ arm_file_start (void)
- if (print_tune_info)
- arm_print_tune_info ();
-
-- if (! TARGET_SOFT_FLOAT && TARGET_VFP)
-+ if (! TARGET_SOFT_FLOAT)
- {
- if (TARGET_HARD_FLOAT && TARGET_VFP_SINGLE)
- arm_emit_eabi_attribute ("Tag_ABI_HardFP_use", 27, 1);
-@@ -26160,11 +26001,10 @@ arm_internal_label (FILE *stream, const char *prefix, unsigned long labelno)
-
- /* Output code to add DELTA to the first argument, and then jump
- to FUNCTION. Used for C++ multiple inheritance. */
-+
- static void
--arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
-- HOST_WIDE_INT delta,
-- HOST_WIDE_INT vcall_offset ATTRIBUTE_UNUSED,
-- tree function)
-+arm_thumb1_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
-+ HOST_WIDE_INT, tree function)
- {
- static int thunk_label = 0;
- char label[256];
-@@ -26305,6 +26145,76 @@ arm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
- final_end_function ();
- }
-
-+/* MI thunk handling for TARGET_32BIT. */
-+
-+static void
-+arm32_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
-+ HOST_WIDE_INT vcall_offset, tree function)
-+{
-+  /* On ARM, this_regno is R0 or R1 depending on
-+     whether the function returns an aggregate or not.  */
-+ int this_regno = (aggregate_value_p (TREE_TYPE (TREE_TYPE (function)),
-+ function)
-+ ? R1_REGNUM : R0_REGNUM);
-+
-+ rtx temp = gen_rtx_REG (Pmode, IP_REGNUM);
-+ rtx this_rtx = gen_rtx_REG (Pmode, this_regno);
-+ reload_completed = 1;
-+ emit_note (NOTE_INSN_PROLOGUE_END);
-+
-+ /* Add DELTA to THIS_RTX. */
-+ if (delta != 0)
-+ arm_split_constant (PLUS, Pmode, NULL_RTX,
-+ delta, this_rtx, this_rtx, false);
-+
-+ /* Add *(*THIS_RTX + VCALL_OFFSET) to THIS_RTX. */
-+ if (vcall_offset != 0)
-+ {
-+ /* Load *THIS_RTX. */
-+ emit_move_insn (temp, gen_rtx_MEM (Pmode, this_rtx));
-+ /* Compute *THIS_RTX + VCALL_OFFSET. */
-+ arm_split_constant (PLUS, Pmode, NULL_RTX, vcall_offset, temp, temp,
-+ false);
-+ /* Compute *(*THIS_RTX + VCALL_OFFSET). */
-+ emit_move_insn (temp, gen_rtx_MEM (Pmode, temp));
-+ emit_insn (gen_add3_insn (this_rtx, this_rtx, temp));
-+ }
-+
-+ /* Generate a tail call to the target function. */
-+ if (!TREE_USED (function))
-+ {
-+ assemble_external (function);
-+ TREE_USED (function) = 1;
-+ }
-+ rtx funexp = XEXP (DECL_RTL (function), 0);
-+ funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
-+ rtx_insn * insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
-+ SIBLING_CALL_P (insn) = 1;
-+
-+ insn = get_insns ();
-+ shorten_branches (insn);
-+ final_start_function (insn, file, 1);
-+ final (insn, file, 1);
-+ final_end_function ();
-+
-+ /* Stop pretending this is a post-reload pass. */
-+ reload_completed = 0;
-+}
-+
-+/* Output code to add DELTA to the first argument, and then jump
-+ to FUNCTION. Used for C++ multiple inheritance. */
-+
-+static void
-+arm_output_mi_thunk (FILE *file, tree thunk, HOST_WIDE_INT delta,
-+ HOST_WIDE_INT vcall_offset, tree function)
-+{
-+ if (TARGET_32BIT)
-+ arm32_output_mi_thunk (file, thunk, delta, vcall_offset, function);
-+ else
-+ arm_thumb1_mi_thunk (file, thunk, delta, vcall_offset, function);
-+}
-+
- int
- arm_emit_vector_const (FILE *file, rtx x)
- {
-@@ -27543,7 +27453,7 @@ arm_mangle_type (const_tree type)
- static const int thumb_core_reg_alloc_order[] =
- {
- 3, 2, 1, 0, 4, 5, 6, 7,
-- 14, 12, 8, 9, 10, 11
-+ 12, 14, 8, 9, 10, 11
- };
-
- /* Adjust register allocation order when compiling for Thumb. */
-@@ -27689,7 +27599,7 @@ arm_conditional_register_usage (void)
- if (TARGET_THUMB1)
- fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
-
-- if (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP)
-+ if (TARGET_32BIT && TARGET_HARD_FLOAT)
- {
- /* VFPv3 registers are disabled when earlier VFP
- versions are selected due to the definition of
-@@ -27760,7 +27670,7 @@ arm_preferred_rename_class (reg_class_t rclass)
- return NO_REGS;
- }
-
--/* Compute the atrribute "length" of insn "*push_multi".
-+/* Compute the attribute "length" of insn "*push_multi".
- So this function MUST be kept in sync with that insn pattern. */
- int
- arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
-@@ -27777,6 +27687,11 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
-
- /* Thumb2 mode. */
- regno = REGNO (first_op);
-+  /* For PUSH/STM in Thumb-2 mode, we can use 16-bit encodings if the register
-+     list fits in 8 bits.  Normally this means all registers in the list must
-+     be LO_REGS, that is (R0-R7).  If any HI_REGS is used, then we must use
-+     32-bit encodings.  The one exception is PUSH, where LR (a HI_REG) can
-+     still be used with the 16-bit encoding.  */
- hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
- for (i = 1; i < num_saves && !hi_reg; i++)
- {
-@@ -27789,6 +27704,56 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
- return 4;
- }
-
-+/* Compute the attribute "length" of an insn.  Currently, this function is
-+   used for "*load_multiple_with_writeback", "*pop_multiple_with_return" and
-+   "*pop_multiple_with_writeback_and_return".  OPERANDS is the toplevel
-+   PARALLEL rtx, RETURN_PC is true if OPERANDS contains a return insn, and
-+   WRITE_BACK_P is true if OPERANDS contains an insn that explicitly updates
-+   the base register.  */
-+
-+int
-+arm_attr_length_pop_multi (rtx *operands, bool return_pc, bool write_back_p)
-+{
-+ /* ARM mode. */
-+ if (TARGET_ARM)
-+ return 4;
-+ /* Thumb1 mode. */
-+ if (TARGET_THUMB1)
-+ return 2;
-+
-+ rtx parallel_op = operands[0];
-+  /* Index of the last element of the PARALLEL.  */
-+  unsigned indx = XVECLEN (parallel_op, 0) - 1;
-+  /* Start with the number of the base register.  */
-+  unsigned regno = REGNO (operands[1]);
-+  /* Skip the return and write-back patterns;
-+     only the register pop patterns matter for the analysis below.  */
-+ unsigned first_indx = 0;
-+ first_indx += return_pc ? 1 : 0;
-+ first_indx += write_back_p ? 1 : 0;
-+
-+  /* A pop operation can be done through LDM or POP.  If the base register is
-+     SP and write-back is used, then an LDM is an alias of POP.  */
-+ bool pop_p = (regno == SP_REGNUM && write_back_p);
-+ bool ldm_p = !pop_p;
-+
-+ /* Check base register for LDM. */
-+ if (ldm_p && REGNO_REG_CLASS (regno) == HI_REGS)
-+ return 4;
-+
-+ /* Check each register in the list. */
-+ for (; indx >= first_indx; indx--)
-+ {
-+ regno = REGNO (XEXP (XVECEXP (parallel_op, 0, indx), 0));
-+ /* For POP, PC in HI_REGS can be used with 16-bit encoding. See similar
-+ comment in arm_attr_length_push_multi. */
-+ if (REGNO_REG_CLASS (regno) == HI_REGS
-+ && (regno != PC_REGNUM || ldm_p))
-+ return 4;
-+ }
-+
-+ return 2;
-+}
-+
- /* Compute the number of instructions emitted by output_move_double. */
- int
- arm_count_output_move_double_insns (rtx *operands)
-@@ -27820,7 +27785,11 @@ vfp3_const_double_for_fract_bits (rtx operand)
- HOST_WIDE_INT value = real_to_integer (&r0);
- value = value & 0xffffffff;
- if ((value != 0) && ( (value & (value - 1)) == 0))
-- return int_log2 (value);
-+ {
-+ int ret = exact_log2 (value);
-+ gcc_assert (IN_RANGE (ret, 0, 31));
-+ return ret;
-+ }
- }
- }
- return 0;
-@@ -27960,9 +27929,9 @@ emit_unlikely_jump (rtx insn)
- void
- arm_expand_compare_and_swap (rtx operands[])
- {
-- rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
-+ rtx bval, bdst, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
- machine_mode mode;
-- rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
-+ rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
-
- bval = operands[0];
- rval = operands[1];
-@@ -28019,43 +27988,54 @@ arm_expand_compare_and_swap (rtx operands[])
- gcc_unreachable ();
- }
-
-- emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
-+ bdst = TARGET_THUMB1 ? bval : gen_rtx_REG (CCmode, CC_REGNUM);
-+ emit_insn (gen (bdst, rval, mem, oldval, newval, is_weak, mod_s, mod_f));
-
- if (mode == QImode || mode == HImode)
- emit_move_insn (operands[1], gen_lowpart (mode, rval));
-
- /* In all cases, we arrange for success to be signaled by Z set.
- This arrangement allows for the boolean result to be used directly
-- in a subsequent branch, post optimization. */
-- x = gen_rtx_REG (CCmode, CC_REGNUM);
-- x = gen_rtx_EQ (SImode, x, const0_rtx);
-- emit_insn (gen_rtx_SET (bval, x));
-+     in a subsequent branch, post optimization.  For Thumb-1 targets, the
-+     boolean negation of the result is also stored in bval, because the
-+     Thumb-1 backend lacks dependency tracking for the CC flag: flag-setting
-+     is not represented at the RTL level.  */
-+ if (TARGET_THUMB1)
-+ emit_insn (gen_cstoresi_eq0_thumb1 (bval, bdst));
-+ else
-+ {
-+ x = gen_rtx_EQ (SImode, bdst, const0_rtx);
-+ emit_insn (gen_rtx_SET (bval, x));
-+ }
- }
-
- /* Split a compare and swap pattern. It is IMPLEMENTATION DEFINED whether
- another memory store between the load-exclusive and store-exclusive can
- reset the monitor from Exclusive to Open state. This means we must wait
- until after reload to split the pattern, lest we get a register spill in
-- the middle of the atomic sequence. */
-+   the middle of the atomic sequence.  Success of the compare and swap is
-+   indicated by the Z flag being set for 32-bit targets and by neg_bval
-+   being zero for Thumb-1 targets (i.e. the negation of the boolean value
-+   returned by the atomic_compare_and_swapmode standard pattern in
-+   operand 0).  */
-
- void
- arm_split_compare_and_swap (rtx operands[])
- {
-- rtx rval, mem, oldval, newval, scratch;
-+ rtx rval, mem, oldval, newval, neg_bval;
- machine_mode mode;
- enum memmodel mod_s, mod_f;
- bool is_weak;
- rtx_code_label *label1, *label2;
- rtx x, cond;
-
-- rval = operands[0];
-- mem = operands[1];
-- oldval = operands[2];
-- newval = operands[3];
-- is_weak = (operands[4] != const0_rtx);
-- mod_s = memmodel_from_int (INTVAL (operands[5]));
-- mod_f = memmodel_from_int (INTVAL (operands[6]));
-- scratch = operands[7];
-+ rval = operands[1];
-+ mem = operands[2];
-+ oldval = operands[3];
-+ newval = operands[4];
-+ is_weak = (operands[5] != const0_rtx);
-+ mod_s = memmodel_from_int (INTVAL (operands[6]));
-+ mod_f = memmodel_from_int (INTVAL (operands[7]));
-+ neg_bval = TARGET_THUMB1 ? operands[0] : operands[8];
- mode = GET_MODE (mem);
-
- bool is_armv8_sync = arm_arch8 && is_mm_sync (mod_s);
-@@ -28087,26 +28067,44 @@ arm_split_compare_and_swap (rtx operands[])
-
- arm_emit_load_exclusive (mode, rval, mem, use_acquire);
-
-- cond = arm_gen_compare_reg (NE, rval, oldval, scratch);
-- x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-- gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
-- emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
-+  /* Z is set to 0 for 32-bit targets (resp. neg_bval set to 1 for Thumb-1)
-+     if oldval != rval, as required to communicate with
-+     arm_expand_compare_and_swap.  */
-+ if (TARGET_32BIT)
-+ {
-+ cond = arm_gen_compare_reg (NE, rval, oldval, neg_bval);
-+ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-+ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
-+ emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
-+ }
-+ else
-+ {
-+ emit_move_insn (neg_bval, const1_rtx);
-+ cond = gen_rtx_NE (VOIDmode, rval, oldval);
-+ if (thumb1_cmpneg_operand (oldval, SImode))
-+ emit_unlikely_jump (gen_cbranchsi4_scratch (neg_bval, rval, oldval,
-+ label2, cond));
-+ else
-+ emit_unlikely_jump (gen_cbranchsi4_insn (cond, rval, oldval, label2));
-+ }
-
-- arm_emit_store_exclusive (mode, scratch, mem, newval, use_release);
-+ arm_emit_store_exclusive (mode, neg_bval, mem, newval, use_release);
-
- /* Weak or strong, we want EQ to be true for success, so that we
- match the flags that we got from the compare above. */
-- cond = gen_rtx_REG (CCmode, CC_REGNUM);
-- x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
-- emit_insn (gen_rtx_SET (cond, x));
-+ if (TARGET_32BIT)
-+ {
-+ cond = gen_rtx_REG (CCmode, CC_REGNUM);
-+ x = gen_rtx_COMPARE (CCmode, neg_bval, const0_rtx);
-+ emit_insn (gen_rtx_SET (cond, x));
-+ }
-
- if (!is_weak)
- {
-- x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-- gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
-- emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
-+ /* Z is set to boolean value of !neg_bval, as required to communicate
-+ with arm_expand_compare_and_swap. */
-+ x = gen_rtx_NE (VOIDmode, neg_bval, const0_rtx);
-+ emit_unlikely_jump (gen_cbranchsi4 (x, neg_bval, const0_rtx, label1));
- }
-
- if (!is_mm_relaxed (mod_f))
-@@ -28121,6 +28119,15 @@ arm_split_compare_and_swap (rtx operands[])
- emit_label (label2);
- }
-
-+/* Split an atomic operation pattern.  The operation is given by CODE and is
-+   one of PLUS, MINUS, IOR, XOR, SET (for an exchange operation) or NOT (for
-+   a nand operation).  The operation is performed on the content at MEM and
-+   on VALUE, following the memory model MODEL_RTX.  The content at MEM before
-+   and after the operation is returned in OLD_OUT and NEW_OUT respectively,
-+   while the success of the operation is returned in COND.  Using a scratch
-+   register or an operand register for these determines what result is
-+   returned for that pattern.  */
-+
- void
- arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
- rtx value, rtx model_rtx, rtx cond)
-@@ -28129,6 +28136,7 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
- machine_mode mode = GET_MODE (mem);
- machine_mode wmode = (mode == DImode ? DImode : SImode);
- rtx_code_label *label;
-+ bool all_low_regs, bind_old_new;
- rtx x;
-
- bool is_armv8_sync = arm_arch8 && is_mm_sync (model);
-@@ -28163,6 +28171,28 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
-
- arm_emit_load_exclusive (mode, old_out, mem, use_acquire);
-
-+  /* Does the operation require the destination and the first operand to use
-+     the same register?  This is decided by the register constraints of the
-+     relevant insn patterns in thumb1.md.  */
-+ gcc_assert (!new_out || REG_P (new_out));
-+ all_low_regs = REG_P (value) && REGNO_REG_CLASS (REGNO (value)) == LO_REGS
-+ && new_out && REGNO_REG_CLASS (REGNO (new_out)) == LO_REGS
-+ && REGNO_REG_CLASS (REGNO (old_out)) == LO_REGS;
-+ bind_old_new =
-+ (TARGET_THUMB1
-+ && code != SET
-+ && code != MINUS
-+ && (code != PLUS || (!all_low_regs && !satisfies_constraint_L (value))));
-+
-+  /* We want to return the old value while putting the result of the
-+     operation in the same register as the old value, so copy the old value
-+     over to the destination register and use that register for the
-+     operation.  */
-+ if (old_out && bind_old_new)
-+ {
-+ emit_move_insn (new_out, old_out);
-+ old_out = new_out;
-+ }
-+
- switch (code)
- {
- case SET:
-@@ -28377,6 +28407,8 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
- case V8QImode: gen = gen_neon_vuzpv8qi_internal; break;
- case V8HImode: gen = gen_neon_vuzpv8hi_internal; break;
- case V4HImode: gen = gen_neon_vuzpv4hi_internal; break;
-+ case V8HFmode: gen = gen_neon_vuzpv8hf_internal; break;
-+ case V4HFmode: gen = gen_neon_vuzpv4hf_internal; break;
- case V4SImode: gen = gen_neon_vuzpv4si_internal; break;
- case V2SImode: gen = gen_neon_vuzpv2si_internal; break;
- case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break;
-@@ -28450,6 +28482,8 @@ arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
- case V8QImode: gen = gen_neon_vzipv8qi_internal; break;
- case V8HImode: gen = gen_neon_vzipv8hi_internal; break;
- case V4HImode: gen = gen_neon_vzipv4hi_internal; break;
-+ case V8HFmode: gen = gen_neon_vzipv8hf_internal; break;
-+ case V4HFmode: gen = gen_neon_vzipv4hf_internal; break;
- case V4SImode: gen = gen_neon_vzipv4si_internal; break;
- case V2SImode: gen = gen_neon_vzipv2si_internal; break;
- case V2SFmode: gen = gen_neon_vzipv2sf_internal; break;
-@@ -28502,6 +28536,8 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
- case V8QImode: gen = gen_neon_vrev32v8qi; break;
- case V8HImode: gen = gen_neon_vrev64v8hi; break;
- case V4HImode: gen = gen_neon_vrev64v4hi; break;
-+ case V8HFmode: gen = gen_neon_vrev64v8hf; break;
-+ case V4HFmode: gen = gen_neon_vrev64v4hf; break;
- default:
- return false;
- }
-@@ -28585,6 +28621,8 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
- case V8QImode: gen = gen_neon_vtrnv8qi_internal; break;
- case V8HImode: gen = gen_neon_vtrnv8hi_internal; break;
- case V4HImode: gen = gen_neon_vtrnv4hi_internal; break;
-+ case V8HFmode: gen = gen_neon_vtrnv8hf_internal; break;
-+ case V4HFmode: gen = gen_neon_vtrnv4hf_internal; break;
- case V4SImode: gen = gen_neon_vtrnv4si_internal; break;
- case V2SImode: gen = gen_neon_vtrnv2si_internal; break;
- case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break;
-@@ -28660,6 +28698,8 @@ arm_evpc_neon_vext (struct expand_vec_perm_d *d)
- case V8HImode: gen = gen_neon_vextv8hi; break;
- case V2SImode: gen = gen_neon_vextv2si; break;
- case V4SImode: gen = gen_neon_vextv4si; break;
-+ case V4HFmode: gen = gen_neon_vextv4hf; break;
-+ case V8HFmode: gen = gen_neon_vextv8hf; break;
- case V2SFmode: gen = gen_neon_vextv2sf; break;
- case V4SFmode: gen = gen_neon_vextv4sf; break;
- case V2DImode: gen = gen_neon_vextv2di; break;
-@@ -29185,7 +29225,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
- {
- enum rtx_code code = GET_CODE (*comparison);
- int code_int;
-- machine_mode mode = (GET_MODE (*op1) == VOIDmode)
-+ machine_mode mode = (GET_MODE (*op1) == VOIDmode)
- ? GET_MODE (*op2) : GET_MODE (*op1);
-
- gcc_assert (GET_MODE (*op1) != VOIDmode || GET_MODE (*op2) != VOIDmode);
-@@ -29213,11 +29253,19 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
- *op2 = force_reg (mode, *op2);
- return true;
-
-+ case HFmode:
-+ if (!TARGET_VFP_FP16INST)
-+ break;
-+ /* FP16 comparisons are done in SF mode. */
-+ mode = SFmode;
-+ *op1 = convert_to_mode (mode, *op1, 1);
-+ *op2 = convert_to_mode (mode, *op2, 1);
-+ /* Fall through. */
- case SFmode:
- case DFmode:
-- if (!arm_float_compare_operand (*op1, mode))
-+ if (!vfp_compare_operand (*op1, mode))
- *op1 = force_reg (mode, *op1);
-- if (!arm_float_compare_operand (*op2, mode))
-+ if (!vfp_compare_operand (*op2, mode))
- *op2 = force_reg (mode, *op2);
- return true;
- default:
-@@ -29759,11 +29807,57 @@ arm_macro_fusion_p (void)
- return current_tune->fusible_ops != tune_params::FUSE_NOTHING;
- }
-
-+/* Return true if the two back-to-back sets PREV_SET, CURR_SET are suitable
-+ for MOVW / MOVT macro fusion. */
-+
-+static bool
-+arm_sets_movw_movt_fusible_p (rtx prev_set, rtx curr_set)
-+{
-+ /* We are trying to fuse
-+ movw imm / movt imm
-+ instructions as a group that gets scheduled together. */
-+
-+ rtx set_dest = SET_DEST (curr_set);
-+
-+ if (GET_MODE (set_dest) != SImode)
-+ return false;
-+
-+ /* We are trying to match:
-+ prev (movw) == (set (reg r0) (const_int imm16))
-+ curr (movt) == (set (zero_extract (reg r0)
-+ (const_int 16)
-+ (const_int 16))
-+ (const_int imm16_1))
-+ or
-+ prev (movw) == (set (reg r1)
-+ (high (symbol_ref ("SYM"))))
-+ curr (movt) == (set (reg r0)
-+ (lo_sum (reg r1)
-+ (symbol_ref ("SYM")))) */
-+
-+ if (GET_CODE (set_dest) == ZERO_EXTRACT)
-+ {
-+ if (CONST_INT_P (SET_SRC (curr_set))
-+ && CONST_INT_P (SET_SRC (prev_set))
-+ && REG_P (XEXP (set_dest, 0))
-+ && REG_P (SET_DEST (prev_set))
-+ && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
-+ return true;
-+
-+ }
-+ else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
-+ && REG_P (SET_DEST (curr_set))
-+ && REG_P (SET_DEST (prev_set))
-+ && GET_CODE (SET_SRC (prev_set)) == HIGH
-+ && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
-+ return true;
-+
-+ return false;
-+}
-
- static bool
- aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
- {
-- rtx set_dest;
- rtx prev_set = single_set (prev);
- rtx curr_set = single_set (curr);
-
-@@ -29781,54 +29875,26 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
- && aarch_crypto_can_dual_issue (prev, curr))
- return true;
-
-- if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT)
-- {
-- /* We are trying to fuse
-- movw imm / movt imm
-- instructions as a group that gets scheduled together. */
--
-- set_dest = SET_DEST (curr_set);
--
-- if (GET_MODE (set_dest) != SImode)
-- return false;
-+ if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT
-+ && arm_sets_movw_movt_fusible_p (prev_set, curr_set))
-+ return true;
-
-- /* We are trying to match:
-- prev (movw) == (set (reg r0) (const_int imm16))
-- curr (movt) == (set (zero_extract (reg r0)
-- (const_int 16)
-- (const_int 16))
-- (const_int imm16_1))
-- or
-- prev (movw) == (set (reg r1)
-- (high (symbol_ref ("SYM"))))
-- curr (movt) == (set (reg r0)
-- (lo_sum (reg r1)
-- (symbol_ref ("SYM")))) */
-- if (GET_CODE (set_dest) == ZERO_EXTRACT)
-- {
-- if (CONST_INT_P (SET_SRC (curr_set))
-- && CONST_INT_P (SET_SRC (prev_set))
-- && REG_P (XEXP (set_dest, 0))
-- && REG_P (SET_DEST (prev_set))
-- && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
-- return true;
-- }
-- else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
-- && REG_P (SET_DEST (curr_set))
-- && REG_P (SET_DEST (prev_set))
-- && GET_CODE (SET_SRC (prev_set)) == HIGH
-- && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
-- return true;
-- }
- return false;
- }
-
-+/* Return true iff the instruction fusion described by OP is enabled. */
-+bool
-+arm_fusion_enabled_p (tune_params::fuse_ops op)
-+{
-+ return current_tune->fusible_ops & op;
-+}
-+
- /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
-
- static unsigned HOST_WIDE_INT
- arm_asan_shadow_offset (void)
- {
-- return (unsigned HOST_WIDE_INT) 1 << 29;
-+ return HOST_WIDE_INT_1U << 29;
- }
-
-
-@@ -29853,9 +29919,9 @@ arm_const_not_ok_for_debug_p (rtx p)
- && GET_CODE (XEXP (p, 0)) == SYMBOL_REF
- && (decl_op0 = SYMBOL_REF_DECL (XEXP (p, 0))))
- {
-- if ((TREE_CODE (decl_op1) == VAR_DECL
-+ if ((VAR_P (decl_op1)
- || TREE_CODE (decl_op1) == CONST_DECL)
-- && (TREE_CODE (decl_op0) == VAR_DECL
-+ && (VAR_P (decl_op0)
- || TREE_CODE (decl_op0) == CONST_DECL))
- return (get_variable_section (decl_op1, false)
- != get_variable_section (decl_op0, false));
-@@ -29988,9 +30054,8 @@ arm_can_inline_p (tree caller, tree callee)
- if ((caller_fpu->features & callee_fpu->features) != callee_fpu->features)
- return false;
-
-- /* Need same model and regs. */
-- if (callee_fpu->model != caller_fpu->model
-- || callee_fpu->regs != callee_fpu->regs)
-+ /* Need same FPU regs. */
-+  if (callee_fpu->regs != caller_fpu->regs)
- return false;
-
- /* OK to inline between different modes.
-@@ -30333,4 +30398,113 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
- return;
- }
-
-+
-+/* Construct and return a PARALLEL RTX vector with elements numbering the
-+ lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
-+ the vector - from the perspective of the architecture. This does not
-+ line up with GCC's perspective on lane numbers, so we end up with
-+ different masks depending on our target endian-ness. The diagram
-+ below may help. We must draw the distinction when building masks
-+ which select one half of the vector. An instruction selecting
-+   architectural low-lanes for a big-endian target must be described using
-+ a mask selecting GCC high-lanes.
-+
-+ Big-Endian Little-Endian
-+
-+GCC 0 1 2 3 3 2 1 0
-+ | x | x | x | x | | x | x | x | x |
-+Architecture 3 2 1 0 3 2 1 0
-+
-+Low Mask: { 2, 3 } { 0, 1 }
-+High Mask: { 0, 1 } { 2, 3 }
-+*/
-+
-+rtx
-+arm_simd_vect_par_cnst_half (machine_mode mode, bool high)
-+{
-+ int nunits = GET_MODE_NUNITS (mode);
-+ rtvec v = rtvec_alloc (nunits / 2);
-+ int high_base = nunits / 2;
-+ int low_base = 0;
-+ int base;
-+ rtx t1;
-+ int i;
-+
-+ if (BYTES_BIG_ENDIAN)
-+ base = high ? low_base : high_base;
-+ else
-+ base = high ? high_base : low_base;
-+
-+ for (i = 0; i < nunits / 2; i++)
-+ RTVEC_ELT (v, i) = GEN_INT (base + i);
-+
-+ t1 = gen_rtx_PARALLEL (mode, v);
-+ return t1;
-+}
-+
-+/* Check OP for validity as a PARALLEL RTX vector with elements
-+   numbering the lanes of either the high (HIGH == TRUE) or low
-+   (HIGH == FALSE) half of the vector, from the perspective of the
-+   architecture.  See the diagram above arm_simd_vect_par_cnst_half
-+   for more details.  */
-+
-+bool
-+arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
-+ bool high)
-+{
-+ rtx ideal = arm_simd_vect_par_cnst_half (mode, high);
-+ HOST_WIDE_INT count_op = XVECLEN (op, 0);
-+ HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
-+ int i = 0;
-+
-+ if (!VECTOR_MODE_P (mode))
-+ return false;
-+
-+ if (count_op != count_ideal)
-+ return false;
-+
-+ for (i = 0; i < count_ideal; i++)
-+ {
-+ rtx elt_op = XVECEXP (op, 0, i);
-+ rtx elt_ideal = XVECEXP (ideal, 0, i);
-+
-+ if (!CONST_INT_P (elt_op)
-+ || INTVAL (elt_ideal) != INTVAL (elt_op))
-+ return false;
-+ }
-+ return true;
-+}
-+
-+/* Can output mi_thunk for all cases except for non-zero vcall_offset
-+ in Thumb1. */
-+static bool
-+arm_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
-+ const_tree)
-+{
-+  /* For now, we punt and do not handle this for TARGET_THUMB1.  */
-+ if (vcall_offset && TARGET_THUMB1)
-+ return false;
-+
-+ /* Otherwise ok. */
-+ return true;
-+}
-+
-+/* Generate RTL for a conditional branch with rtx comparison CODE in
-+ mode CC_MODE. The destination of the unlikely conditional branch
-+ is LABEL_REF. */
-+
-+void
-+arm_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
-+ rtx label_ref)
-+{
-+ rtx x;
-+ x = gen_rtx_fmt_ee (code, VOIDmode,
-+ gen_rtx_REG (cc_mode, CC_REGNUM),
-+ const0_rtx);
-+
-+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-+ gen_rtx_LABEL_REF (VOIDmode, label_ref),
-+ pc_rtx);
-+ emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
-+}
-+
- #include "gt-arm.h"
---- a/src/gcc/config/arm/arm.h
-+++ b/src/gcc/config/arm/arm.h
-@@ -80,11 +80,6 @@ extern arm_cc arm_current_cc;
- extern int arm_target_label;
- extern int arm_ccfsm_state;
- extern GTY(()) rtx arm_target_insn;
--/* The label of the current constant pool. */
--extern rtx pool_vector_label;
--/* Set to 1 when a return insn is output, this means that the epilogue
-- is not needed. */
--extern int return_used_this_function;
- /* Callback to output language specific object attributes. */
- extern void (*arm_lang_output_object_attributes_hook)(void);
-
-@@ -139,7 +134,6 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
- #define TARGET_HARD_FLOAT (arm_float_abi != ARM_FLOAT_ABI_SOFT)
- /* Use hardware floating point calling convention. */
- #define TARGET_HARD_FLOAT_ABI (arm_float_abi == ARM_FLOAT_ABI_HARD)
--#define TARGET_VFP (TARGET_FPU_MODEL == ARM_FP_MODEL_VFP)
- #define TARGET_IWMMXT (arm_arch_iwmmxt)
- #define TARGET_IWMMXT2 (arm_arch_iwmmxt2)
- #define TARGET_REALLY_IWMMXT (TARGET_IWMMXT && TARGET_32BIT)
-@@ -177,50 +171,57 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
- to be more careful with TARGET_NEON as noted below. */
-
- /* FPU has the full VFPv3/NEON register file of 32 D registers.  */
--#define TARGET_VFPD32 (TARGET_VFP && TARGET_FPU_REGS == VFP_REG_D32)
-+#define TARGET_VFPD32 (TARGET_FPU_REGS == VFP_REG_D32)
-
- /* FPU supports VFPv3 instructions. */
--#define TARGET_VFP3 (TARGET_VFP && TARGET_FPU_REV >= 3)
-+#define TARGET_VFP3 (TARGET_FPU_REV >= 3)
-
- /* FPU supports FPv5 instructions. */
--#define TARGET_VFP5 (TARGET_VFP && TARGET_FPU_REV >= 5)
-+#define TARGET_VFP5 (TARGET_FPU_REV >= 5)
-
- /* FPU only supports VFP single-precision instructions. */
--#define TARGET_VFP_SINGLE (TARGET_VFP && TARGET_FPU_REGS == VFP_REG_SINGLE)
-+#define TARGET_VFP_SINGLE (TARGET_FPU_REGS == VFP_REG_SINGLE)
-
- /* FPU supports VFP double-precision instructions. */
--#define TARGET_VFP_DOUBLE (TARGET_VFP && TARGET_FPU_REGS != VFP_REG_SINGLE)
-+#define TARGET_VFP_DOUBLE (TARGET_FPU_REGS != VFP_REG_SINGLE)
-
- /* FPU supports half-precision floating-point with NEON element load/store. */
--#define TARGET_NEON_FP16 \
-- (TARGET_VFP \
-- && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON | FPU_FL_FP16))
-+#define TARGET_NEON_FP16 \
-+ (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON) \
-+ && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16))
-
- /* FPU supports VFP half-precision floating-point. */
- #define TARGET_FP16 \
-- (TARGET_VFP && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16))
-+ (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_FP16))
-
- /* FPU supports fused-multiply-add operations. */
--#define TARGET_FMA (TARGET_VFP && TARGET_FPU_REV >= 4)
-+#define TARGET_FMA (TARGET_FPU_REV >= 4)
-
- /* FPU is ARMv8 compatible. */
--#define TARGET_FPU_ARMV8 (TARGET_VFP && TARGET_FPU_REV >= 8)
-+#define TARGET_FPU_ARMV8 (TARGET_FPU_REV >= 8)
-
- /* FPU supports Crypto extensions. */
- #define TARGET_CRYPTO \
-- (TARGET_VFP && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_CRYPTO))
-+ (ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_CRYPTO))
-
- /* FPU supports Neon instructions. The setting of this macro gets
- revealed via __ARM_NEON__ so we add extra guards upon TARGET_32BIT
- and TARGET_HARD_FLOAT to ensure that NEON instructions are
- available. */
- #define TARGET_NEON \
-- (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP \
-+ (TARGET_32BIT && TARGET_HARD_FLOAT \
- && ARM_FPU_FSET_HAS (TARGET_FPU_FEATURES, FPU_FL_NEON))
-
- /* FPU supports ARMv8.1 Adv.SIMD extensions. */
- #define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1)
-
-+/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */
-+#define TARGET_VFP_FP16INST \
-+ (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPU_ARMV8 && arm_fp16_inst)
-+
-+/* FPU supports the AdvSIMD FP16 instructions for ARMv8.2 and later. */
-+#define TARGET_NEON_FP16INST (TARGET_VFP_FP16INST && TARGET_NEON_RDMA)
-+
- /* Q-bit is present. */
- #define TARGET_ARM_QBIT \
- (TARGET_32BIT && arm_arch5e && (arm_arch_notm || arm_arch7))
-@@ -236,7 +237,7 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
-
- /* Should MOVW/MOVT be used in preference to a constant pool. */
- #define TARGET_USE_MOVT \
-- (arm_arch_thumb2 \
-+ (TARGET_HAVE_MOVT \
- && (arm_disable_literal_pool \
- || (!optimize_size && !current_tune->prefer_constant_pool)))
-
-@@ -251,14 +252,18 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
- #define TARGET_HAVE_MEMORY_BARRIER (TARGET_HAVE_DMB || TARGET_HAVE_DMB_MCR)
-
- /* Nonzero if this chip supports ldrex and strex */
--#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) || arm_arch7)
-+#define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) \
-+ || arm_arch7 \
-+ || (arm_arch8 && !arm_arch_notm))
-
- /* Nonzero if this chip supports LPAE. */
- #define TARGET_HAVE_LPAE \
- (arm_arch7 && ARM_FSET_HAS_CPU1 (insn_flags, FL_FOR_ARCH7VE))
-
- /* Nonzero if this chip supports ldrex{bh} and strex{bh}. */
--#define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) || arm_arch7)
-+#define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) \
-+ || arm_arch7 \
-+ || (arm_arch8 && !arm_arch_notm))
-
- /* Nonzero if this chip supports ldrexd and strexd. */
- #define TARGET_HAVE_LDREXD (((arm_arch6k && TARGET_ARM) \
-@@ -267,9 +272,20 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
- /* Nonzero if this chip supports load-acquire and store-release. */
- #define TARGET_HAVE_LDACQ (TARGET_ARM_ARCH >= 8)
-
-+/* Nonzero if this chip supports LDAEXD and STLEXD. */
-+#define TARGET_HAVE_LDACQEXD (TARGET_ARM_ARCH >= 8 \
-+ && TARGET_32BIT \
-+ && arm_arch_notm)
-+
-+/* Nonzero if this chip provides the MOVW and MOVT instructions. */
-+#define TARGET_HAVE_MOVT (arm_arch_thumb2 || arm_arch8)
-+
-+/* Nonzero if this chip provides the CBZ and CBNZ instructions. */
-+#define TARGET_HAVE_CBZ (arm_arch_thumb2 || arm_arch8)
-+
- /* Nonzero if integer division instructions supported. */
- #define TARGET_IDIV ((TARGET_ARM && arm_arch_arm_hwdiv) \
-- || (TARGET_THUMB2 && arm_arch_thumb_hwdiv))
-+ || (TARGET_THUMB && arm_arch_thumb_hwdiv))
-
- /* Nonzero if disallow volatile memory access in IT block. */
- #define TARGET_NO_VOLATILE_CE (arm_arch_no_volatile_ce)
-@@ -349,7 +365,6 @@ enum vfp_reg_type
- extern const struct arm_fpu_desc
- {
- const char *name;
-- enum arm_fp_model model;
- int rev;
- enum vfp_reg_type regs;
- arm_fpu_feature_set features;
-@@ -358,7 +373,6 @@ extern const struct arm_fpu_desc
- /* Accessors. */
-
- #define TARGET_FPU_NAME (all_fpus[arm_fpu_index].name)
--#define TARGET_FPU_MODEL (all_fpus[arm_fpu_index].model)
- #define TARGET_FPU_REV (all_fpus[arm_fpu_index].rev)
- #define TARGET_FPU_REGS (all_fpus[arm_fpu_index].regs)
- #define TARGET_FPU_FEATURES (all_fpus[arm_fpu_index].features)
-@@ -402,7 +416,9 @@ enum base_architecture
- BASE_ARCH_7R = 7,
- BASE_ARCH_7M = 7,
- BASE_ARCH_7EM = 7,
-- BASE_ARCH_8A = 8
-+ BASE_ARCH_8A = 8,
-+ BASE_ARCH_8M_BASE = 8,
-+ BASE_ARCH_8M_MAIN = 8
- };
-
- /* The major revision number of the ARM Architecture implemented by the target. */
-@@ -447,6 +463,13 @@ extern int arm_arch8;
- /* Nonzero if this chip supports the ARM Architecture 8.1 extensions. */
- extern int arm_arch8_1;
-
-+/* Nonzero if this chip supports the ARM Architecture 8.2 extensions. */
-+extern int arm_arch8_2;
-+
-+/* Nonzero if this chip supports the FP16 instructions extension of ARM
-+ Architecture 8.2. */
-+extern int arm_fp16_inst;
-+
- /* Nonzero if this chip can benefit from load scheduling. */
- extern int arm_ld_sched;
-
-@@ -478,6 +501,9 @@ extern int arm_tune_cortex_a9;
- interworking clean. */
- extern int arm_cpp_interwork;
-
-+/* Nonzero if chip supports Thumb 1. */
-+extern int arm_arch_thumb1;
-+
- /* Nonzero if chip supports Thumb 2. */
- extern int arm_arch_thumb2;
-
-@@ -502,6 +528,9 @@ extern bool arm_disable_literal_pool;
- /* Nonzero if chip supports the ARMv8 CRC instructions. */
- extern int arm_arch_crc;
-
-+/* Nonzero if chip supports the ARMv8-M Security Extensions. */
-+extern int arm_arch_cmse;
-+
- #ifndef TARGET_DEFAULT
- #define TARGET_DEFAULT (MASK_APCS_FRAME)
- #endif
-@@ -1191,7 +1220,7 @@ enum reg_class
- the data layout happens to be consistent for big-endian, so we explicitly allow
- that case. */
- #define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \
-- (TARGET_VFP && TARGET_BIG_END \
-+ (TARGET_BIG_END \
- && !(GET_MODE_SIZE (FROM) == 16 && GET_MODE_SIZE (TO) == 8) \
- && (GET_MODE_SIZE (FROM) > UNITS_PER_WORD \
- || GET_MODE_SIZE (TO) > UNITS_PER_WORD) \
-@@ -1242,8 +1271,7 @@ enum reg_class
- NO_REGS is returned. */
- #define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X) \
- /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \
-- ((TARGET_VFP && TARGET_HARD_FLOAT \
-- && IS_VFP_CLASS (CLASS)) \
-+ ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
- ? coproc_secondary_reload_class (MODE, X, FALSE) \
- : (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \
- ? coproc_secondary_reload_class (MODE, X, TRUE) \
-@@ -1255,8 +1283,7 @@ enum reg_class
- /* If we need to load shorts byte-at-a-time, then we need a scratch. */
- #define SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X) \
- /* Restrict which direct reloads are allowed for VFP/iWMMXt regs. */ \
-- ((TARGET_VFP && TARGET_HARD_FLOAT \
-- && IS_VFP_CLASS (CLASS)) \
-+ ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
- ? coproc_secondary_reload_class (MODE, X, FALSE) : \
- (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) ? \
- coproc_secondary_reload_class (MODE, X, TRUE) : \
-@@ -1363,6 +1390,7 @@ enum reg_class
- #define ARM_FT_VOLATILE (1 << 4) /* Does not return. */
- #define ARM_FT_NESTED (1 << 5) /* Embedded inside another func. */
- #define ARM_FT_STACKALIGN (1 << 6) /* Called with misaligned stack. */
-+#define ARM_FT_CMSE_ENTRY (1 << 7) /* ARMv8-M non-secure entry function. */
-
- /* Some macros to test these flags. */
- #define ARM_FUNC_TYPE(t) (t & ARM_FT_TYPE_MASK)
-@@ -1371,6 +1399,7 @@ enum reg_class
- #define IS_NAKED(t) (t & ARM_FT_NAKED)
- #define IS_NESTED(t) (t & ARM_FT_NESTED)
- #define IS_STACKALIGN(t) (t & ARM_FT_STACKALIGN)
-+#define IS_CMSE_ENTRY(t) (t & ARM_FT_CMSE_ENTRY)
-
-
- /* Structure used to hold the function stack frame layout. Offsets are
-@@ -1516,7 +1545,7 @@ typedef struct
- On the ARM, r0-r3 are used to pass args. */
- #define FUNCTION_ARG_REGNO_P(REGNO) \
- (IN_RANGE ((REGNO), 0, 3) \
-- || (TARGET_AAPCS_BASED && TARGET_VFP && TARGET_HARD_FLOAT \
-+ || (TARGET_AAPCS_BASED && TARGET_HARD_FLOAT \
- && IN_RANGE ((REGNO), FIRST_VFP_REGNUM, FIRST_VFP_REGNUM + 15)) \
- || (TARGET_IWMMXT_ABI \
- && IN_RANGE ((REGNO), FIRST_IWMMXT_REGNUM, FIRST_IWMMXT_REGNUM + 9)))
-@@ -2187,13 +2216,9 @@ extern int making_const_table;
- #define TARGET_ARM_ARCH \
- (arm_base_arch) \
-
--#define TARGET_ARM_V6M (!arm_arch_notm && !arm_arch_thumb2)
--#define TARGET_ARM_V7M (!arm_arch_notm && arm_arch_thumb2)
--
- /* The highest Thumb instruction set version supported by the chip. */
--#define TARGET_ARM_ARCH_ISA_THUMB \
-- (arm_arch_thumb2 ? 2 \
-- : ((TARGET_ARM_ARCH >= 5 || arm_arch4t) ? 1 : 0))
-+#define TARGET_ARM_ARCH_ISA_THUMB \
-+ (arm_arch_thumb2 ? 2 : (arm_arch_thumb1 ? 1 : 0))
-
- /* Expands to an upper-case char of the target's architectural
- profile. */
-@@ -2245,13 +2270,18 @@ extern const char *arm_rewrite_mcpu (int argc, const char **argv);
- " :%{march=*:-march=%*}}" \
- BIG_LITTLE_SPEC
-
-+extern const char *arm_target_thumb_only (int argc, const char **argv);
-+#define TARGET_MODE_SPEC_FUNCTIONS \
-+ { "target_mode_check", arm_target_thumb_only },
-+
- /* -mcpu=native handling only makes sense with compiler running on
- an ARM chip. */
- #if defined(__arm__)
- extern const char *host_detect_local_cpu (int argc, const char **argv);
- # define EXTRA_SPEC_FUNCTIONS \
- { "local_cpu_detect", host_detect_local_cpu }, \
-- BIG_LITTLE_CPU_SPEC_FUNCTIONS
-+ BIG_LITTLE_CPU_SPEC_FUNCTIONS \
-+ TARGET_MODE_SPEC_FUNCTIONS
-
- # define MCPU_MTUNE_NATIVE_SPECS \
- " %{march=native:%<march=native %:local_cpu_detect(arch)}" \
-@@ -2259,10 +2289,21 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
- " %{mtune=native:%<mtune=native %:local_cpu_detect(tune)}"
- #else
- # define MCPU_MTUNE_NATIVE_SPECS ""
--# define EXTRA_SPEC_FUNCTIONS BIG_LITTLE_CPU_SPEC_FUNCTIONS
-+# define EXTRA_SPEC_FUNCTIONS \
-+ BIG_LITTLE_CPU_SPEC_FUNCTIONS \
-+ TARGET_MODE_SPEC_FUNCTIONS
- #endif
-
--#define DRIVER_SELF_SPECS MCPU_MTUNE_NATIVE_SPECS
-+/* Automatically add -mthumb for Thumb-only targets if mode isn't specified
-+ via the configuration option --with-mode or via the command line. The
-+ function target_mode_check is called to do the check with either:
-+ - an array of -march values if any is given;
-+ - an array of -mcpu values if any is given;
-+ - an empty array. */
-+#define TARGET_MODE_SPECS \
-+ " %{!marm:%{!mthumb:%:target_mode_check(%{march=*:%*;mcpu=*:%*;:})}}"
-+
-+#define DRIVER_SELF_SPECS MCPU_MTUNE_NATIVE_SPECS TARGET_MODE_SPECS
- #define TARGET_SUPPORTS_WIDE_INT 1
-
- /* For switching between functions with different target attributes. */
---- a/src/gcc/config/arm/arm.md
-+++ b/src/gcc/config/arm/arm.md
-@@ -118,10 +118,10 @@
- ; This can be "a" for ARM, "t" for either of the Thumbs, "32" for
- ; TARGET_32BIT, "t1" or "t2" to specify a specific Thumb mode. "v6"
- ; for ARM or Thumb-2 with arm_arch6, and nov6 for ARM without
--; arm_arch6. "v6t2" for Thumb-2 with arm_arch6. This attribute is
--; used to compute attribute "enabled", use type "any" to enable an
--; alternative in all cases.
--(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,neon_for_64bits,avoid_neon_for_64bits,iwmmxt,iwmmxt2,armv6_or_vfpv3"
-+; arm_arch6. "v6t2" for Thumb-2 with arm_arch6 and "v8mb" for ARMv8-M
-+; Baseline. This attribute is used to compute attribute "enabled",
-+; use type "any" to enable an alternative in all cases.
-+(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,v8mb,neon_for_64bits,avoid_neon_for_64bits,iwmmxt,iwmmxt2,armv6_or_vfpv3,neon"
- (const_string "any"))
-
- (define_attr "arch_enabled" "no,yes"
-@@ -160,6 +160,10 @@
- (match_test "TARGET_32BIT && arm_arch6 && arm_arch_thumb2"))
- (const_string "yes")
-
-+ (and (eq_attr "arch" "v8mb")
-+ (match_test "TARGET_THUMB1 && arm_arch8"))
-+ (const_string "yes")
-+
- (and (eq_attr "arch" "avoid_neon_for_64bits")
- (match_test "TARGET_NEON")
- (not (match_test "TARGET_PREFER_NEON_64BITS")))
-@@ -177,6 +181,10 @@
- (and (eq_attr "arch" "armv6_or_vfpv3")
- (match_test "arm_arch6 || TARGET_VFP3"))
- (const_string "yes")
-+
-+ (and (eq_attr "arch" "neon")
-+ (match_test "TARGET_NEON"))
-+ (const_string "yes")
- ]
-
- (const_string "no")))
-@@ -539,6 +547,32 @@
- (set_attr "type" "multiple")]
- )
-
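-+;; Overflow-checked addition.  The add<mode>3_compareV / add<mode>3_compareC
-+;; patterns below compare the sign- or zero-extended (widened) sum against
-+;; the truncated machine sum, setting the V or C flag on signed or unsigned
-+;; overflow respectively; the expanders then emit an unlikely branch to
-+;; operand 3 when that flag is set.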
-+(define_expand "addv<mode>4"
-+ [(match_operand:SIDI 0 "register_operand")
-+ (match_operand:SIDI 1 "register_operand")
-+ (match_operand:SIDI 2 "register_operand")
-+ (match_operand 3 "")]
-+ "TARGET_32BIT"
-+{
-+ emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
-+ arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
-+
-+ DONE;
-+})
-+
-+(define_expand "uaddv<mode>4"
-+ [(match_operand:SIDI 0 "register_operand")
-+ (match_operand:SIDI 1 "register_operand")
-+ (match_operand:SIDI 2 "register_operand")
-+ (match_operand 3 "")]
-+ "TARGET_32BIT"
-+{
-+ emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
-+ arm_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
-+
-+ DONE;
-+})
-+
- (define_expand "addsi3"
- [(set (match_operand:SI 0 "s_register_operand" "")
- (plus:SI (match_operand:SI 1 "s_register_operand" "")
-@@ -617,6 +651,165 @@
- ]
- )
-
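-+;; After reload the DImode form is split into an ADDS on the low words
-+;; followed by an ADCS on the high words, so the final V (or C) flag
-+;; reflects overflow of the whole 64-bit addition.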
-+(define_insn_and_split "adddi3_compareV"
-+ [(set (reg:CC_V CC_REGNUM)
-+ (ne:CC_V
-+ (plus:TI
-+ (sign_extend:TI (match_operand:DI 1 "register_operand" "r"))
-+ (sign_extend:TI (match_operand:DI 2 "register_operand" "r")))
-+ (sign_extend:TI (plus:DI (match_dup 1) (match_dup 2)))))
-+ (set (match_operand:DI 0 "register_operand" "=&r")
-+ (plus:DI (match_dup 1) (match_dup 2)))]
-+ "TARGET_32BIT"
-+ "#"
-+ "&& reload_completed"
-+ [(parallel [(set (reg:CC_C CC_REGNUM)
-+ (compare:CC_C (plus:SI (match_dup 1) (match_dup 2))
-+ (match_dup 1)))
-+ (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))])
-+ (parallel [(set (reg:CC_V CC_REGNUM)
-+ (ne:CC_V
-+ (plus:DI (plus:DI
-+ (sign_extend:DI (match_dup 4))
-+ (sign_extend:DI (match_dup 5)))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
-+ (plus:DI (sign_extend:DI
-+ (plus:SI (match_dup 4) (match_dup 5)))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
-+ (set (match_dup 3) (plus:SI (plus:SI
-+ (match_dup 4) (match_dup 5))
-+ (ltu:SI (reg:CC_C CC_REGNUM)
-+ (const_int 0))))])]
-+ "
-+ {
-+ operands[3] = gen_highpart (SImode, operands[0]);
-+ operands[0] = gen_lowpart (SImode, operands[0]);
-+ operands[4] = gen_highpart (SImode, operands[1]);
-+ operands[1] = gen_lowpart (SImode, operands[1]);
-+ operands[5] = gen_highpart (SImode, operands[2]);
-+ operands[2] = gen_lowpart (SImode, operands[2]);
-+ }"
-+ [(set_attr "conds" "set")
-+ (set_attr "length" "8")
-+ (set_attr "type" "multiple")]
-+)
-+
-+(define_insn "addsi3_compareV"
-+ [(set (reg:CC_V CC_REGNUM)
-+ (ne:CC_V
-+ (plus:DI
-+ (sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
-+ (sign_extend:DI (match_operand:SI 2 "register_operand" "r")))
-+ (sign_extend:DI (plus:SI (match_dup 1) (match_dup 2)))))
-+ (set (match_operand:SI 0 "register_operand" "=r")
-+ (plus:SI (match_dup 1) (match_dup 2)))]
-+ "TARGET_32BIT"
-+ "adds%?\\t%0, %1, %2"
-+ [(set_attr "conds" "set")
-+ (set_attr "type" "alus_sreg")]
-+)
-+
-+(define_insn "*addsi3_compareV_upper"
-+ [(set (reg:CC_V CC_REGNUM)
-+ (ne:CC_V
-+ (plus:DI
-+ (plus:DI
-+ (sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
-+ (sign_extend:DI (match_operand:SI 2 "register_operand" "r")))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
-+ (plus:DI (sign_extend:DI
-+ (plus:SI (match_dup 1) (match_dup 2)))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
-+ (set (match_operand:SI 0 "register_operand" "=r")
-+ (plus:SI
-+ (plus:SI (match_dup 1) (match_dup 2))
-+ (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
-+ "TARGET_32BIT"
-+ "adcs%?\\t%0, %1, %2"
-+ [(set_attr "conds" "set")
-+ (set_attr "type" "adcs_reg")]
-+)
-+
-+(define_insn_and_split "adddi3_compareC"
-+ [(set (reg:CC_C CC_REGNUM)
-+ (ne:CC_C
-+ (plus:TI
-+ (zero_extend:TI (match_operand:DI 1 "register_operand" "r"))
-+ (zero_extend:TI (match_operand:DI 2 "register_operand" "r")))
-+ (zero_extend:TI (plus:DI (match_dup 1) (match_dup 2)))))
-+ (set (match_operand:DI 0 "register_operand" "=&r")
-+ (plus:DI (match_dup 1) (match_dup 2)))]
-+ "TARGET_32BIT"
-+ "#"
-+ "&& reload_completed"
-+ [(parallel [(set (reg:CC_C CC_REGNUM)
-+ (compare:CC_C (plus:SI (match_dup 1) (match_dup 2))
-+ (match_dup 1)))
-+ (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))])
-+ (parallel [(set (reg:CC_C CC_REGNUM)
-+ (ne:CC_C
-+ (plus:DI (plus:DI
-+ (zero_extend:DI (match_dup 4))
-+ (zero_extend:DI (match_dup 5)))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
-+ (plus:DI (zero_extend:DI
-+ (plus:SI (match_dup 4) (match_dup 5)))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
-+ (set (match_dup 3) (plus:SI
-+ (plus:SI (match_dup 4) (match_dup 5))
-+ (ltu:SI (reg:CC_C CC_REGNUM)
-+ (const_int 0))))])]
-+ "
-+ {
-+ operands[3] = gen_highpart (SImode, operands[0]);
-+ operands[0] = gen_lowpart (SImode, operands[0]);
-+ operands[4] = gen_highpart (SImode, operands[1]);
-+ operands[5] = gen_highpart (SImode, operands[2]);
-+ operands[1] = gen_lowpart (SImode, operands[1]);
-+ operands[2] = gen_lowpart (SImode, operands[2]);
-+ }"
-+ [(set_attr "conds" "set")
-+ (set_attr "length" "8")
-+ (set_attr "type" "multiple")]
-+)
-+
-+(define_insn "*addsi3_compareC_upper"
-+ [(set (reg:CC_C CC_REGNUM)
-+ (ne:CC_C
-+ (plus:DI
-+ (plus:DI
-+ (zero_extend:DI (match_operand:SI 1 "register_operand" "r"))
-+ (zero_extend:DI (match_operand:SI 2 "register_operand" "r")))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))
-+ (plus:DI (zero_extend:DI
-+ (plus:SI (match_dup 1) (match_dup 2)))
-+ (ltu:DI (reg:CC_C CC_REGNUM) (const_int 0)))))
-+ (set (match_operand:SI 0 "register_operand" "=r")
-+ (plus:SI
-+ (plus:SI (match_dup 1) (match_dup 2))
-+ (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
-+ "TARGET_32BIT"
-+ "adcs%?\\t%0, %1, %2"
-+ [(set_attr "conds" "set")
-+ (set_attr "type" "adcs_reg")]
-+)
-+
-+(define_insn "addsi3_compareC"
-+ [(set (reg:CC_C CC_REGNUM)
-+ (ne:CC_C
-+ (plus:DI
-+ (zero_extend:DI (match_operand:SI 1 "register_operand" "r"))
-+ (zero_extend:DI (match_operand:SI 2 "register_operand" "r")))
-+ (zero_extend:DI
-+ (plus:SI (match_dup 1) (match_dup 2)))))
-+ (set (match_operand:SI 0 "register_operand" "=r")
-+ (plus:SI (match_dup 1) (match_dup 2)))]
-+ "TARGET_32BIT"
-+ "adds%?\\t%0, %1, %2"
-+ [(set_attr "conds" "set")
-+ (set_attr "type" "alus_sreg")]
-+)
-+
- (define_insn "addsi3_compare0"
- [(set (reg:CC_NOOV CC_REGNUM)
- (compare:CC_NOOV
-@@ -866,20 +1059,90 @@
- (set_attr "type" "adcs_reg")]
- )
-
-+(define_expand "subv<mode>4"
-+ [(match_operand:SIDI 0 "register_operand")
-+ (match_operand:SIDI 1 "register_operand")
-+ (match_operand:SIDI 2 "register_operand")
-+ (match_operand 3 "")]
-+ "TARGET_32BIT"
-+{
-+ emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
-+ arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
-+
-+ DONE;
-+})
-+
-+(define_expand "usubv<mode>4"
-+ [(match_operand:SIDI 0 "register_operand")
-+ (match_operand:SIDI 1 "register_operand")
-+ (match_operand:SIDI 2 "register_operand")
-+ (match_operand 3 "")]
-+ "TARGET_32BIT"
-+{
-+ emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
-+ arm_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
-+
-+ DONE;
-+})
-+
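-+;; A DImode subtract-with-flags splits after reload into a flag-setting
-+;; subtract on the low words followed by a flag-setting
-+;; subtract-with-borrow on the high words.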
-+(define_insn_and_split "subdi3_compare1"
-+ [(set (reg:CC CC_REGNUM)
-+ (compare:CC
-+ (match_operand:DI 1 "register_operand" "r")
-+ (match_operand:DI 2 "register_operand" "r")))
-+ (set (match_operand:DI 0 "register_operand" "=&r")
-+ (minus:DI (match_dup 1) (match_dup 2)))]
-+ "TARGET_32BIT"
-+ "#"
-+ "&& reload_completed"
-+ [(parallel [(set (reg:CC CC_REGNUM)
-+ (compare:CC (match_dup 1) (match_dup 2)))
-+ (set (match_dup 0) (minus:SI (match_dup 1) (match_dup 2)))])
-+ (parallel [(set (reg:CC CC_REGNUM)
-+ (compare:CC (match_dup 4) (match_dup 5)))
-+ (set (match_dup 3) (minus:SI (minus:SI (match_dup 4) (match_dup 5))
-+ (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))])]
-+ {
-+ operands[3] = gen_highpart (SImode, operands[0]);
-+ operands[0] = gen_lowpart (SImode, operands[0]);
-+ operands[4] = gen_highpart (SImode, operands[1]);
-+ operands[1] = gen_lowpart (SImode, operands[1]);
-+ operands[5] = gen_highpart (SImode, operands[2]);
-+ operands[2] = gen_lowpart (SImode, operands[2]);
-+ }
-+ [(set_attr "conds" "set")
-+ (set_attr "length" "8")
-+ (set_attr "type" "multiple")]
-+)
-+
-+(define_insn "subsi3_compare1"
-+ [(set (reg:CC CC_REGNUM)
-+ (compare:CC
-+ (match_operand:SI 1 "register_operand" "r")
-+ (match_operand:SI 2 "register_operand" "r")))
-+ (set (match_operand:SI 0 "register_operand" "=r")
-+ (minus:SI (match_dup 1) (match_dup 2)))]
-+ "TARGET_32BIT"
-+ "subs%?\\t%0, %1, %2"
-+ [(set_attr "conds" "set")
-+ (set_attr "type" "alus_sreg")]
-+)
-+
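-+;; The Pz alternative below (constant zero for operand 1) is for Thumb-2,
-+;; which has no RSC: 0 - %2 - borrow is instead computed as
-+;; %2 - (%2 << 1) - borrow.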
- (define_insn "*subsi3_carryin"
-- [(set (match_operand:SI 0 "s_register_operand" "=r,r")
-- (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I")
-- (match_operand:SI 2 "s_register_operand" "r,r"))
-- (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r,r")
-+ (minus:SI (minus:SI (match_operand:SI 1 "reg_or_int_operand" "r,I,Pz")
-+ (match_operand:SI 2 "s_register_operand" "r,r,r"))
-+ (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
- "TARGET_32BIT"
- "@
- sbc%?\\t%0, %1, %2
-- rsc%?\\t%0, %2, %1"
-+ rsc%?\\t%0, %2, %1
-+ sbc%?\\t%0, %2, %2, lsl #1"
- [(set_attr "conds" "use")
-- (set_attr "arch" "*,a")
-+ (set_attr "arch" "*,a,t2")
- (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-- (set_attr "type" "adc_reg,adc_imm")]
-+ (set_attr "type" "adc_reg,adc_imm,alu_shift_imm")]
- )
-
- (define_insn "*subsi3_carryin_const"
-@@ -1895,7 +2158,7 @@
- [(set (match_operand:SF 0 "s_register_operand" "")
- (div:SF (match_operand:SF 1 "s_register_operand" "")
- (match_operand:SF 2 "s_register_operand" "")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "")
-
- (define_expand "divdf3"
-@@ -2137,13 +2400,13 @@
-
- for (i = 9; i <= 31; i++)
- {
-- if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (operands[2]))
-+ if ((HOST_WIDE_INT_1 << i) - 1 == INTVAL (operands[2]))
- {
- emit_insn (gen_extzv (operands[0], operands[1], GEN_INT (i),
- const0_rtx));
- DONE;
- }
-- else if ((((HOST_WIDE_INT) 1) << i) - 1
-+ else if ((HOST_WIDE_INT_1 << i) - 1
- == ~INTVAL (operands[2]))
- {
- rtx shift = GEN_INT (i);
-@@ -2442,7 +2705,7 @@
- {
- int start_bit = INTVAL (operands[2]);
- int width = INTVAL (operands[1]);
-- HOST_WIDE_INT mask = (((HOST_WIDE_INT)1) << width) - 1;
-+ HOST_WIDE_INT mask = (HOST_WIDE_INT_1 << width) - 1;
- rtx target, subtarget;
-
- if (arm_arch_thumb2)
-@@ -3050,7 +3313,14 @@
- (xor:DI (match_operand:DI 1 "s_register_operand" "")
- (match_operand:DI 2 "arm_xordi_operand" "")))]
- "TARGET_32BIT"
-- ""
-+ {
-+ /* The iWMMXt pattern for xordi3 accepts only register operands, but we
-+ want to reuse this expander for all TARGET_32BIT targets, so force any
-+ constant into a register. Unlike anddi3 and iordi3, there are no NEON
-+ instructions that take an immediate. */
-+ if (TARGET_IWMMXT && !REG_P (operands[2]))
-+ operands[2] = force_reg (DImode, operands[2]);
-+ }
- )
-
- (define_insn_and_split "*xordi3_insn"
-@@ -3744,8 +4014,7 @@
- {
- rtx scratch1, scratch2;
-
-- if (CONST_INT_P (operands[2])
-- && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
-+ if (operands[2] == CONST1_RTX (SImode))
- {
- emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
- DONE;
-@@ -3790,7 +4059,7 @@
- "TARGET_EITHER"
- "
- if (CONST_INT_P (operands[2])
-- && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31)
-+ && (UINTVAL (operands[2])) > 31)
- {
- emit_insn (gen_movsi (operands[0], const0_rtx));
- DONE;
-@@ -3818,8 +4087,7 @@
- {
- rtx scratch1, scratch2;
-
-- if (CONST_INT_P (operands[2])
-- && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
-+ if (operands[2] == CONST1_RTX (SImode))
- {
- emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
- DONE;
-@@ -3864,7 +4132,7 @@
- "TARGET_EITHER"
- "
- if (CONST_INT_P (operands[2])
-- && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31)
-+ && UINTVAL (operands[2]) > 31)
- operands[2] = GEN_INT (31);
- "
- )
-@@ -3889,8 +4157,7 @@
- {
- rtx scratch1, scratch2;
-
-- if (CONST_INT_P (operands[2])
-- && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
-+ if (operands[2] == CONST1_RTX (SImode))
- {
- emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
- DONE;
-@@ -3935,7 +4202,7 @@
- "TARGET_EITHER"
- "
- if (CONST_INT_P (operands[2])
-- && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31)
-+ && (UINTVAL (operands[2])) > 31)
- {
- emit_insn (gen_movsi (operands[0], const0_rtx));
- DONE;
-@@ -3969,7 +4236,7 @@
- if (TARGET_32BIT)
- {
- if (CONST_INT_P (operands[2])
-- && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) > 31)
-+ && UINTVAL (operands[2]) > 31)
- operands[2] = GEN_INT (INTVAL (operands[2]) % 32);
- }
- else /* TARGET_THUMB1 */
-@@ -4300,9 +4567,11 @@
- (define_insn "*extv_reg"
- [(set (match_operand:SI 0 "s_register_operand" "=r")
- (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r")
-- (match_operand:SI 2 "const_int_M_operand" "M")
-- (match_operand:SI 3 "const_int_M_operand" "M")))]
-- "arm_arch_thumb2"
-+ (match_operand:SI 2 "const_int_operand" "n")
-+ (match_operand:SI 3 "const_int_operand" "n")))]
-+ "arm_arch_thumb2
-+ && IN_RANGE (INTVAL (operands[3]), 0, 31)
-+ && IN_RANGE (INTVAL (operands[2]), 1, 32 - INTVAL (operands[3]))"
- "sbfx%?\t%0, %1, %3, %2"
- [(set_attr "length" "4")
- (set_attr "predicable" "yes")
-@@ -4313,9 +4582,11 @@
- (define_insn "extzv_t2"
- [(set (match_operand:SI 0 "s_register_operand" "=r")
- (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r")
-- (match_operand:SI 2 "const_int_M_operand" "M")
-- (match_operand:SI 3 "const_int_M_operand" "M")))]
-- "arm_arch_thumb2"
-+ (match_operand:SI 2 "const_int_operand" "n")
-+ (match_operand:SI 3 "const_int_operand" "n")))]
-+ "arm_arch_thumb2
-+ && IN_RANGE (INTVAL (operands[3]), 0, 31)
-+ && IN_RANGE (INTVAL (operands[2]), 1, 32 - INTVAL (operands[3]))"
- "ubfx%?\t%0, %1, %3, %2"
- [(set_attr "length" "4")
- (set_attr "predicable" "yes")
-@@ -4326,23 +4597,29 @@
-
- ;; Division instructions
- (define_insn "divsi3"
-- [(set (match_operand:SI 0 "s_register_operand" "=r")
-- (div:SI (match_operand:SI 1 "s_register_operand" "r")
-- (match_operand:SI 2 "s_register_operand" "r")))]
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
-+ (div:SI (match_operand:SI 1 "s_register_operand" "r,r")
-+ (match_operand:SI 2 "s_register_operand" "r,r")))]
- "TARGET_IDIV"
-- "sdiv%?\t%0, %1, %2"
-- [(set_attr "predicable" "yes")
-+ "@
-+ sdiv%?\t%0, %1, %2
-+ sdiv\t%0, %1, %2"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
- (set_attr "type" "sdiv")]
- )
-
- (define_insn "udivsi3"
-- [(set (match_operand:SI 0 "s_register_operand" "=r")
-- (udiv:SI (match_operand:SI 1 "s_register_operand" "r")
-- (match_operand:SI 2 "s_register_operand" "r")))]
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
-+ (udiv:SI (match_operand:SI 1 "s_register_operand" "r,r")
-+ (match_operand:SI 2 "s_register_operand" "r,r")))]
- "TARGET_IDIV"
-- "udiv%?\t%0, %1, %2"
-- [(set_attr "predicable" "yes")
-+ "@
-+ udiv%?\t%0, %1, %2
-+ udiv\t%0, %1, %2"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
- (set_attr "type" "udiv")]
- )
-@@ -4350,6 +4627,63 @@
-
- ;; Unary arithmetic insns
-
-+(define_expand "negvsi3"
-+ [(match_operand:SI 0 "register_operand")
-+ (match_operand:SI 1 "register_operand")
-+ (match_operand 2 "")]
-+ "TARGET_32BIT"
-+{
-+ emit_insn (gen_subsi3_compare (operands[0], const0_rtx, operands[1]));
-+ arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[2]);
-+
-+ DONE;
-+})
-+
-+(define_expand "negvdi3"
-+ [(match_operand:DI 0 "register_operand")
-+ (match_operand:DI 1 "register_operand")
-+ (match_operand 2 "")]
-+ "TARGET_ARM"
-+{
-+ emit_insn (gen_negdi2_compare (operands[0], operands[1]));
-+ arm_gen_unlikely_cbranch (NE, CC_Vmode, operands[2]);
-+
-+ DONE;
-+})
-+
-+
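-+;; Negation overflows only for the most negative value; computing
-+;; 0 - x with the flags set leaves exactly that case visible in V.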
-+(define_insn_and_split "negdi2_compare"
-+ [(set (reg:CC CC_REGNUM)
-+ (compare:CC
-+ (const_int 0)
-+ (match_operand:DI 1 "register_operand" "0,r")))
-+ (set (match_operand:DI 0 "register_operand" "=r,&r")
-+ (minus:DI (const_int 0) (match_dup 1)))]
-+ "TARGET_ARM"
-+ "#"
-+ "&& reload_completed"
-+ [(parallel [(set (reg:CC CC_REGNUM)
-+ (compare:CC (const_int 0) (match_dup 1)))
-+ (set (match_dup 0) (minus:SI (const_int 0)
-+ (match_dup 1)))])
-+ (parallel [(set (reg:CC CC_REGNUM)
-+ (compare:CC (const_int 0) (match_dup 3)))
-+ (set (match_dup 2)
-+ (minus:SI
-+ (minus:SI (const_int 0) (match_dup 3))
-+ (ltu:SI (reg:CC_C CC_REGNUM)
-+ (const_int 0))))])]
-+ {
-+ operands[2] = gen_highpart (SImode, operands[0]);
-+ operands[0] = gen_lowpart (SImode, operands[0]);
-+ operands[3] = gen_highpart (SImode, operands[1]);
-+ operands[1] = gen_lowpart (SImode, operands[1]);
-+ }
-+ [(set_attr "conds" "set")
-+ (set_attr "length" "8")
-+ (set_attr "type" "multiple")]
-+)
-+
- (define_expand "negdi2"
- [(parallel
- [(set (match_operand:DI 0 "s_register_operand" "")
-@@ -4367,12 +4701,13 @@
-
- ;; The constraints here are to prevent a *partial* overlap (where %Q0 == %R1).
- ;; The first alternative allows the common case of a *full* overlap.
--(define_insn_and_split "*arm_negdi2"
-+(define_insn_and_split "*negdi2_insn"
- [(set (match_operand:DI 0 "s_register_operand" "=r,&r")
- (neg:DI (match_operand:DI 1 "s_register_operand" "0,r")))
- (clobber (reg:CC CC_REGNUM))]
-- "TARGET_ARM"
-- "#" ; "rsbs\\t%Q0, %Q1, #0\;rsc\\t%R0, %R1, #0"
-+ "TARGET_32BIT"
-+ "#" ; rsbs %Q0, %Q1, #0; rsc %R0, %R1, #0 (ARM)
-+ ; negs %Q0, %Q1 ; sbc %R0, %R1, %R1, lsl #1 (Thumb-2)
- "&& reload_completed"
- [(parallel [(set (reg:CC CC_REGNUM)
- (compare:CC (const_int 0) (match_dup 1)))
-@@ -4390,6 +4725,20 @@
- (set_attr "type" "multiple")]
- )
-
-+(define_insn "*negsi2_carryin_compare"
-+ [(set (reg:CC CC_REGNUM)
-+ (compare:CC (const_int 0)
-+ (match_operand:SI 1 "s_register_operand" "r")))
-+ (set (match_operand:SI 0 "s_register_operand" "=r")
-+ (minus:SI (minus:SI (const_int 0)
-+ (match_dup 1))
-+ (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
-+ "TARGET_ARM"
-+ "rscs\\t%0, %1, #0"
-+ [(set_attr "conds" "set")
-+ (set_attr "type" "alus_imm")]
-+)
-+
- (define_expand "negsi2"
- [(set (match_operand:SI 0 "s_register_operand" "")
- (neg:SI (match_operand:SI 1 "s_register_operand" "")))]
-@@ -4412,7 +4761,7 @@
- (define_expand "negsf2"
- [(set (match_operand:SF 0 "s_register_operand" "")
- (neg:SF (match_operand:SF 1 "s_register_operand" "")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- ""
- )
-
-@@ -4685,7 +5034,7 @@
- (define_expand "sqrtsf2"
- [(set (match_operand:SF 0 "s_register_operand" "")
- (sqrt:SF (match_operand:SF 1 "s_register_operand" "")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "")
-
- (define_expand "sqrtdf2"
-@@ -4854,7 +5203,7 @@
- ""
- )
-
--/* DFmode -> HFmode conversions have to go through SFmode. */
-+;; DFmode -> HFmode conversions have to go through SFmode.
- (define_expand "truncdfhf2"
- [(set (match_operand:HF 0 "general_operand" "")
- (float_truncate:HF
-@@ -5117,7 +5466,7 @@
- (match_operator 5 "subreg_lowpart_operator"
- [(match_operand:SI 4 "s_register_operand" "")]))))]
- "TARGET_32BIT
-- && ((unsigned HOST_WIDE_INT) INTVAL (operands[3])
-+ && (UINTVAL (operands[3])
- == (GET_MODE_MASK (GET_MODE (operands[5]))
- & (GET_MODE_MASK (GET_MODE (operands[5]))
- << (INTVAL (operands[2])))))"
-@@ -5361,7 +5710,7 @@
- ""
- )
-
--/* HFmode -> DFmode conversions have to go through SFmode. */
-+;; HFmode -> DFmode conversions have to go through SFmode.
- (define_expand "extendhfdf2"
- [(set (match_operand:DF 0 "general_operand" "")
- (float_extend:DF (match_operand:HF 1 "general_operand" "")))]
-@@ -5490,7 +5839,7 @@
- [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, r, q, m")
- (match_operand:DI 1 "di_operand" "rDa,Db,Dc,mi,q"))]
- "TARGET_32BIT
-- && !(TARGET_HARD_FLOAT && TARGET_VFP)
-+ && !(TARGET_HARD_FLOAT)
- && !TARGET_IWMMXT
- && ( register_operand (operands[0], DImode)
- || register_operand (operands[1], DImode))"
-@@ -5699,12 +6048,15 @@
- ;; LO_SUM adds in the high bits. Fortunately these are opaque operations
- ;; so this does not matter.
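-+;; The v8mb alternative below covers ARMv8-M Baseline, which has MOVT
-+;; but no IT-style predication, hence the unpredicated form.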
- (define_insn "*arm_movt"
-- [(set (match_operand:SI 0 "nonimmediate_operand" "=r")
-- (lo_sum:SI (match_operand:SI 1 "nonimmediate_operand" "0")
-- (match_operand:SI 2 "general_operand" "i")))]
-- "arm_arch_thumb2 && arm_valid_symbolic_address_p (operands[2])"
-- "movt%?\t%0, #:upper16:%c2"
-- [(set_attr "predicable" "yes")
-+ [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r")
-+ (lo_sum:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
-+ (match_operand:SI 2 "general_operand" "i,i")))]
-+ "TARGET_HAVE_MOVT && arm_valid_symbolic_address_p (operands[2])"
-+ "@
-+ movt%?\t%0, #:upper16:%c2
-+ movt\t%0, #:upper16:%c2"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
- (set_attr "length" "4")
- (set_attr "type" "alu_sreg")]
-@@ -5713,8 +6065,7 @@
- (define_insn "*arm_movsi_insn"
- [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m")
- (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk"))]
-- "TARGET_ARM && ! TARGET_IWMMXT
-- && !(TARGET_HARD_FLOAT && TARGET_VFP)
-+ "TARGET_ARM && !TARGET_IWMMXT && !TARGET_HARD_FLOAT
- && ( register_operand (operands[0], SImode)
- || register_operand (operands[1], SImode))"
- "@
-@@ -5726,6 +6077,7 @@
- str%?\\t%1, %0"
- [(set_attr "type" "mov_reg,mov_imm,mvn_imm,mov_imm,load1,store1")
- (set_attr "predicable" "yes")
-+ (set_attr "arch" "*,*,*,v6t2,*,*")
- (set_attr "pool_range" "*,*,*,*,4096,*")
- (set_attr "neg_pool_range" "*,*,*,*,4084,*")]
- )
-@@ -5762,7 +6114,8 @@
- [(set (match_operand:SI 0 "arm_general_register_operand" "")
- (const:SI (plus:SI (match_operand:SI 1 "general_operand" "")
- (match_operand:SI 2 "const_int_operand" ""))))]
-- "TARGET_THUMB2
-+ "TARGET_THUMB
-+ && TARGET_HAVE_MOVT
- && arm_disable_literal_pool
- && reload_completed
- && GET_CODE (operands[1]) == SYMBOL_REF"
-@@ -5793,8 +6146,7 @@
- (define_split
- [(set (match_operand:SI 0 "arm_general_register_operand" "")
- (match_operand:SI 1 "general_operand" ""))]
-- "TARGET_32BIT
-- && TARGET_USE_MOVT && GET_CODE (operands[1]) == SYMBOL_REF
-+ "TARGET_USE_MOVT && GET_CODE (operands[1]) == SYMBOL_REF
- && !flag_pic && !target_word_relocations
- && !arm_tls_referenced_p (operands[1])"
- [(clobber (const_int 0))]
-@@ -6362,7 +6714,7 @@
- [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m,r")
- (match_operand:HI 1 "general_operand" "rIk,K,n,r,mi"))]
- "TARGET_ARM
-- && arm_arch4
-+ && arm_arch4 && !TARGET_HARD_FLOAT
- && (register_operand (operands[0], HImode)
- || register_operand (operands[1], HImode))"
- "@
-@@ -6388,7 +6740,7 @@
- (define_insn "*movhi_bytes"
- [(set (match_operand:HI 0 "s_register_operand" "=r,r,r")
- (match_operand:HI 1 "arm_rhs_operand" "I,rk,K"))]
-- "TARGET_ARM"
-+ "TARGET_ARM && !TARGET_HARD_FLOAT"
- "@
- mov%?\\t%0, %1\\t%@ movhi
- mov%?\\t%0, %1\\t%@ movhi
-@@ -6396,7 +6748,7 @@
- [(set_attr "predicable" "yes")
- (set_attr "type" "mov_imm,mov_reg,mvn_imm")]
- )
--
-+
- ;; We use a DImode scratch because we may occasionally need an additional
- ;; temporary if the address isn't offsettable -- push_reload doesn't seem
- ;; to take any notice of the "o" constraints on reload_memory_operand operand.
-@@ -6518,7 +6870,7 @@
- strb%?\\t%1, %0"
- [(set_attr "type" "mov_reg,mov_reg,mov_imm,mov_imm,mvn_imm,load1,store1,load1,store1")
- (set_attr "predicable" "yes")
-- (set_attr "predicable_short_it" "yes,yes,yes,no,no,no,no,no,no")
-+ (set_attr "predicable_short_it" "yes,yes,no,yes,no,no,no,no,no")
- (set_attr "arch" "t2,any,any,t2,any,t2,t2,any,any")
- (set_attr "length" "2,4,4,2,4,2,2,4,4")]
- )
-@@ -6548,7 +6900,7 @@
- (define_insn "*arm32_movhf"
- [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,r,r")
- (match_operand:HF 1 "general_operand" " m,r,r,F"))]
-- "TARGET_32BIT && !(TARGET_HARD_FLOAT && TARGET_FP16)
-+ "TARGET_32BIT && !TARGET_HARD_FLOAT
- && ( s_register_operand (operands[0], HFmode)
- || s_register_operand (operands[1], HFmode))"
- "*
-@@ -6892,7 +7244,7 @@
- [(set (pc) (if_then_else
- (match_operator 0 "expandable_comparison_operator"
- [(match_operand:SF 1 "s_register_operand" "")
-- (match_operand:SF 2 "arm_float_compare_operand" "")])
-+ (match_operand:SF 2 "vfp_compare_operand" "")])
- (label_ref (match_operand 3 "" ""))
- (pc)))]
- "TARGET_32BIT && TARGET_HARD_FLOAT"
-@@ -6904,7 +7256,7 @@
- [(set (pc) (if_then_else
- (match_operator 0 "expandable_comparison_operator"
- [(match_operand:DF 1 "s_register_operand" "")
-- (match_operand:DF 2 "arm_float_compare_operand" "")])
-+ (match_operand:DF 2 "vfp_compare_operand" "")])
- (label_ref (match_operand 3 "" ""))
- (pc)))]
- "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
-@@ -7366,11 +7718,29 @@
- DONE;
- }")
-
-+(define_expand "cstorehf4"
-+ [(set (match_operand:SI 0 "s_register_operand")
-+ (match_operator:SI 1 "expandable_comparison_operator"
-+ [(match_operand:HF 2 "s_register_operand")
-+ (match_operand:HF 3 "vfp_compare_operand")]))]
-+ "TARGET_VFP_FP16INST"
-+ {
-+ if (!arm_validize_comparison (&operands[1],
-+ &operands[2],
-+ &operands[3]))
-+ FAIL;
-+
-+ emit_insn (gen_cstore_cc (operands[0], operands[1],
-+ operands[2], operands[3]));
-+ DONE;
-+ }
-+)
-+
- (define_expand "cstoresf4"
- [(set (match_operand:SI 0 "s_register_operand" "")
- (match_operator:SI 1 "expandable_comparison_operator"
- [(match_operand:SF 2 "s_register_operand" "")
-- (match_operand:SF 3 "arm_float_compare_operand" "")]))]
-+ (match_operand:SF 3 "vfp_compare_operand" "")]))]
- "TARGET_32BIT && TARGET_HARD_FLOAT"
- "emit_insn (gen_cstore_cc (operands[0], operands[1],
- operands[2], operands[3])); DONE;"
-@@ -7380,7 +7750,7 @@
- [(set (match_operand:SI 0 "s_register_operand" "")
- (match_operator:SI 1 "expandable_comparison_operator"
- [(match_operand:DF 2 "s_register_operand" "")
-- (match_operand:DF 3 "arm_float_compare_operand" "")]))]
-+ (match_operand:DF 3 "vfp_compare_operand" "")]))]
- "TARGET_32BIT && TARGET_HARD_FLOAT && !TARGET_VFP_SINGLE"
- "emit_insn (gen_cstore_cc (operands[0], operands[1],
- operands[2], operands[3])); DONE;"
-@@ -7418,9 +7788,31 @@
- rtx ccreg;
-
- if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0),
-- &XEXP (operands[1], 1)))
-+ &XEXP (operands[1], 1)))
- FAIL;
--
-+
-+ code = GET_CODE (operands[1]);
-+ ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0),
-+ XEXP (operands[1], 1), NULL_RTX);
-+ operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
-+ }"
-+)
-+
-+(define_expand "movhfcc"
-+ [(set (match_operand:HF 0 "s_register_operand")
-+ (if_then_else:HF (match_operand 1 "arm_cond_move_operator")
-+ (match_operand:HF 2 "s_register_operand")
-+ (match_operand:HF 3 "s_register_operand")))]
-+ "TARGET_VFP_FP16INST"
-+ "
-+ {
-+ enum rtx_code code = GET_CODE (operands[1]);
-+ rtx ccreg;
-+
-+ if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0),
-+ &XEXP (operands[1], 1)))
-+ FAIL;
-+
- code = GET_CODE (operands[1]);
- ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0),
- XEXP (operands[1], 1), NULL_RTX);
-@@ -7439,7 +7831,7 @@
- enum rtx_code code = GET_CODE (operands[1]);
- rtx ccreg;
-
-- if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0),
-+ if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0),
- &XEXP (operands[1], 1)))
- FAIL;
-
-@@ -7504,6 +7896,37 @@
- (set_attr "type" "fcsel")]
- )
-
-+(define_insn "*cmovhf"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (if_then_else:HF (match_operator 1 "arm_vsel_comparison_operator"
-+ [(match_operand 2 "cc_register" "") (const_int 0)])
-+ (match_operand:HF 3 "s_register_operand" "t")
-+ (match_operand:HF 4 "s_register_operand" "t")))]
-+ "TARGET_VFP_FP16INST"
-+ "*
-+ {
-+ enum arm_cond_code code = maybe_get_arm_condition_code (operands[1]);
-+ switch (code)
-+ {
-+ case ARM_GE:
-+ case ARM_GT:
-+ case ARM_EQ:
-+ case ARM_VS:
-+ return \"vsel%d1.f16\\t%0, %3, %4\";
-+ case ARM_LT:
-+ case ARM_LE:
-+ case ARM_NE:
-+ case ARM_VC:
-+ return \"vsel%D1.f16\\t%0, %4, %3\";
-+ default:
-+ gcc_unreachable ();
-+ }
-+ return \"\";
-+ }"
-+ [(set_attr "conds" "use")
-+ (set_attr "type" "fcsel")]
-+)
-+
- (define_insn_and_split "*movsicc_insn"
- [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r")
- (if_then_else:SI
-@@ -7627,6 +8050,7 @@
- "
- {
- rtx callee, pat;
-+ tree addr = MEM_EXPR (operands[0]);
-
- /* In an untyped call, we can get NULL for operand 2. */
- if (operands[2] == NULL_RTX)
-@@ -7641,8 +8065,17 @@
- : !REG_P (callee))
- XEXP (operands[0], 0) = force_reg (Pmode, callee);
-
-- pat = gen_call_internal (operands[0], operands[1], operands[2]);
-- arm_emit_call_insn (pat, XEXP (operands[0], 0), false);
-+ if (detect_cmse_nonsecure_call (addr))
-+ {
-+ pat = gen_nonsecure_call_internal (operands[0], operands[1],
-+ operands[2]);
-+ emit_call_insn (pat);
-+ }
-+ else
-+ {
-+ pat = gen_call_internal (operands[0], operands[1], operands[2]);
-+ arm_emit_call_insn (pat, XEXP (operands[0], 0), false);
-+ }
- DONE;
- }"
- )
-@@ -7653,6 +8086,24 @@
- (use (match_operand 2 "" ""))
- (clobber (reg:SI LR_REGNUM))])])
-
-+(define_expand "nonsecure_call_internal"
-+ [(parallel [(call (unspec:SI [(match_operand 0 "memory_operand" "")]
-+ UNSPEC_NONSECURE_MEM)
-+ (match_operand 1 "general_operand" ""))
-+ (use (match_operand 2 "" ""))
-+ (clobber (reg:SI LR_REGNUM))
-+ (clobber (reg:SI 4))])]
-+ "use_cmse"
-+ "
-+ {
-+ rtx tmp;
-+ tmp = copy_to_suggested_reg (XEXP (operands[0], 0),
-+ gen_rtx_REG (SImode, 4),
-+ SImode);
-+
-+ operands[0] = replace_equiv_address (operands[0], tmp);
-+ }")
-+
- (define_insn "*call_reg_armv5"
- [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r"))
- (match_operand 1 "" ""))
-@@ -7688,6 +8139,7 @@
- "
- {
- rtx pat, callee;
-+ tree addr = MEM_EXPR (operands[1]);
-
- /* In an untyped call, we can get NULL for operand 2. */
- if (operands[3] == 0)
-@@ -7702,9 +8154,18 @@
- : !REG_P (callee))
- XEXP (operands[1], 0) = force_reg (Pmode, callee);
-
-- pat = gen_call_value_internal (operands[0], operands[1],
-- operands[2], operands[3]);
-- arm_emit_call_insn (pat, XEXP (operands[1], 0), false);
-+ if (detect_cmse_nonsecure_call (addr))
-+ {
-+ pat = gen_nonsecure_call_value_internal (operands[0], operands[1],
-+ operands[2], operands[3]);
-+ emit_call_insn (pat);
-+ }
-+ else
-+ {
-+ pat = gen_call_value_internal (operands[0], operands[1],
-+ operands[2], operands[3]);
-+ arm_emit_call_insn (pat, XEXP (operands[1], 0), false);
-+ }
- DONE;
- }"
- )
-@@ -7716,6 +8177,25 @@
- (use (match_operand 3 "" ""))
- (clobber (reg:SI LR_REGNUM))])])
-
-+(define_expand "nonsecure_call_value_internal"
-+ [(parallel [(set (match_operand 0 "" "")
-+ (call (unspec:SI [(match_operand 1 "memory_operand" "")]
-+ UNSPEC_NONSECURE_MEM)
-+ (match_operand 2 "general_operand" "")))
-+ (use (match_operand 3 "" ""))
-+ (clobber (reg:SI LR_REGNUM))
-+ (clobber (reg:SI 4))])]
-+ "use_cmse"
-+ "
-+ {
-+ rtx tmp;
-+ tmp = copy_to_suggested_reg (XEXP (operands[1], 0),
-+ gen_rtx_REG (SImode, 4),
-+ SImode);
-+
-+ operands[1] = replace_equiv_address (operands[1], tmp);
-+ }")
-+
- (define_insn "*call_value_reg_armv5"
- [(set (match_operand 0 "" "")
- (call (mem:SI (match_operand:SI 1 "s_register_operand" "r"))
-@@ -8153,8 +8633,8 @@
- )
-
- (define_insn "probe_stack"
-- [(set (match_operand 0 "memory_operand" "=m")
-- (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
-+ [(set (match_operand:SI 0 "memory_operand" "=m")
-+ (unspec:SI [(const_int 0)] UNSPEC_PROBE_STACK))]
- "TARGET_32BIT"
- "str%?\\tr0, %0"
- [(set_attr "type" "store1")
-@@ -10221,8 +10701,8 @@
- (match_operand 1 "const_int_operand" "")))
- (clobber (match_scratch:SI 2 ""))]
- "TARGET_ARM
-- && (((unsigned HOST_WIDE_INT) INTVAL (operands[1]))
-- == (((unsigned HOST_WIDE_INT) INTVAL (operands[1])) >> 24) << 24)"
-+ && ((UINTVAL (operands[1]))
-+ == ((UINTVAL (operands[1])) >> 24) << 24)"
- [(set (match_dup 2) (zero_extend:SI (match_dup 0)))
- (set (reg:CC CC_REGNUM) (compare:CC (match_dup 2) (match_dup 1)))]
- "
-@@ -10562,7 +11042,11 @@
- }
- "
- [(set_attr "type" "load4")
-- (set_attr "predicable" "yes")]
-+ (set_attr "predicable" "yes")
-+ (set (attr "length")
-+ (symbol_ref "arm_attr_length_pop_multi (operands,
-+ /*return_pc=*/false,
-+ /*write_back_p=*/true)"))]
- )
-
- ;; Pop with return (as used in epilogue RTL)
-@@ -10591,7 +11075,10 @@
- }
- "
- [(set_attr "type" "load4")
-- (set_attr "predicable" "yes")]
-+ (set_attr "predicable" "yes")
-+ (set (attr "length")
-+ (symbol_ref "arm_attr_length_pop_multi (operands, /*return_pc=*/true,
-+ /*write_back_p=*/true)"))]
- )
-
- (define_insn "*pop_multiple_with_return"
-@@ -10611,7 +11098,10 @@
- }
- "
- [(set_attr "type" "load4")
-- (set_attr "predicable" "yes")]
-+ (set_attr "predicable" "yes")
-+ (set (attr "length")
-+ (symbol_ref "arm_attr_length_pop_multi (operands, /*return_pc=*/true,
-+ /*write_back_p=*/false)"))]
- )
-
- ;; Load into PC and return
-@@ -10632,7 +11122,7 @@
- (match_operand:SI 2 "const_int_I_operand" "I")))
- (set (match_operand:DF 3 "vfp_hard_register_operand" "")
- (mem:DF (match_dup 1)))])]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "*
- {
- int num_regs = XVECLEN (operands[0], 0);
-@@ -10822,19 +11312,22 @@
- (set_attr "predicable_short_it" "no")
- (set_attr "type" "clz")])
-
--(define_expand "ctzsi2"
-- [(set (match_operand:SI 0 "s_register_operand" "")
-- (ctz:SI (match_operand:SI 1 "s_register_operand" "")))]
-+;; Keep this as a CTZ expression until after reload and then split
-+;; into RBIT + CLZ. Since RBIT is represented as an UNSPEC, it is unlikely
-+;; to fold with any other expression.
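-+;; (RBIT reverses the bit order, so the CLZ of the reversed value is
-+;; the CTZ of the original.)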
-+
-+(define_insn_and_split "ctzsi2"
-+ [(set (match_operand:SI 0 "s_register_operand" "=r")
-+ (ctz:SI (match_operand:SI 1 "s_register_operand" "r")))]
- "TARGET_32BIT && arm_arch_thumb2"
-+ "#"
-+ "&& reload_completed"
-+ [(const_int 0)]
- "
-- {
-- rtx tmp = gen_reg_rtx (SImode);
-- emit_insn (gen_rbitsi2 (tmp, operands[1]));
-- emit_insn (gen_clzsi2 (operands[0], tmp));
-- }
-- DONE;
-- "
--)
-+ emit_insn (gen_rbitsi2 (operands[0], operands[1]));
-+ emit_insn (gen_clzsi2 (operands[0], operands[0]));
-+ DONE;
-+")
-
- ;; V5E instructions.
-
-@@ -10958,13 +11451,16 @@
- ;; We only care about the lower 16 bits of the constant
- ;; being inserted into the upper 16 bits of the register.
- (define_insn "*arm_movtas_ze"
-- [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
-+ [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r,r")
- (const_int 16)
- (const_int 16))
- (match_operand:SI 1 "const_int_operand" ""))]
-- "arm_arch_thumb2"
-- "movt%?\t%0, %L1"
-- [(set_attr "predicable" "yes")
-+ "TARGET_HAVE_MOVT"
-+ "@
-+ movt%?\t%0, %L1
-+ movt\t%0, %L1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
- (set_attr "length" "4")
- (set_attr "type" "alu_sreg")]
---- a/src/gcc/config/arm/arm.opt
-+++ b/src/gcc/config/arm/arm.opt
-@@ -61,10 +61,6 @@ Generate a call to abort if a noreturn function returns.
- mapcs
- Target RejectNegative Mask(APCS_FRAME) Undocumented
-
--mapcs-float
--Target Report Mask(APCS_FLOAT)
--Pass FP arguments in FP registers.
--
- mapcs-frame
- Target Report Mask(APCS_FRAME)
- Generate APCS conformant stack frames.
-@@ -109,6 +105,10 @@ mfloat-abi=
- Target RejectNegative Joined Enum(float_abi_type) Var(arm_float_abi) Init(TARGET_DEFAULT_FLOAT_ABI)
- Specify if floating point hardware should be used.
-
-+mcmse
-+Target RejectNegative Var(use_cmse)
-+Specify that the compiler should target secure code as per ARMv8-M Security Extensions.
-+
- Enum
- Name(float_abi_type) Type(enum float_abi_type)
- Known floating-point ABIs (for use with the -mfloat-abi= option):
-@@ -253,14 +253,6 @@ mrestrict-it
- Target Report Var(arm_restrict_it) Init(2) Save
- Generate IT blocks appropriate for ARMv8.
-
--mold-rtx-costs
--Target Report Mask(OLD_RTX_COSTS)
--Use the old RTX costing tables (transitional).
--
--mnew-generic-costs
--Target Report Mask(NEW_GENERIC_COSTS)
--Use the new generic RTX cost tables if new core-specific cost table not available (transitional).
--
- mfix-cortex-m3-ldrd
- Target Report Var(fix_cm3_ldrd) Init(2)
- Avoid overlapping destination and address registers on LDRD instructions
---- /dev/null
-+++ b/src/gcc/config/arm/arm_cmse.h
-@@ -0,0 +1,199 @@
-+/* ARMv8-M Secure Extensions intrinsics include file.
-+
-+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+
-+#ifndef _GCC_ARM_CMSE_H
-+#define _GCC_ARM_CMSE_H
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+#if __ARM_FEATURE_CMSE & 1
-+
-+#include <stddef.h>
-+#include <stdint.h>
-+
-+#ifdef __ARM_BIG_ENDIAN
-+
-+typedef union {
-+ struct cmse_address_info {
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned idau_region:8;
-+ unsigned idau_region_valid:1;
-+ unsigned secure:1;
-+ unsigned nonsecure_readwrite_ok:1;
-+ unsigned nonsecure_read_ok:1;
-+#else
-+ unsigned :12;
-+#endif
-+ unsigned readwrite_ok:1;
-+ unsigned read_ok:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region_valid:1;
-+#else
-+ unsigned :1;
-+#endif
-+ unsigned mpu_region_valid:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region:8;
-+#else
-+ unsigned :8;
-+#endif
-+ unsigned mpu_region:8;
-+ } flags;
-+ unsigned value;
-+} cmse_address_info_t;
-+
-+#else
-+
-+typedef union {
-+ struct cmse_address_info {
-+ unsigned mpu_region:8;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region:8;
-+#else
-+ unsigned :8;
-+#endif
-+ unsigned mpu_region_valid:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned sau_region_valid:1;
-+#else
-+ unsigned :1;
-+#endif
-+ unsigned read_ok:1;
-+ unsigned readwrite_ok:1;
-+#if __ARM_FEATURE_CMSE & 2
-+ unsigned nonsecure_read_ok:1;
-+ unsigned nonsecure_readwrite_ok:1;
-+ unsigned secure:1;
-+ unsigned idau_region_valid:1;
-+ unsigned idau_region:8;
-+#else
-+ unsigned :12;
-+#endif
-+ } flags;
-+ unsigned value;
-+} cmse_address_info_t;
-+
-+#endif /* __ARM_BIG_ENDIAN */
-+
-+#define cmse_TT_fptr(p) (__cmse_TT_fptr ((__cmse_fptr)(p)))
-+
-+typedef void (*__cmse_fptr)(void);
-+
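-+/* Helpers around the TT instruction family: TT queries the security
-+   attribution of an address, the T variants run the query unprivileged,
-+   and the A variants query the non-secure state from secure code.  */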
-+#define __CMSE_TT_ASM(flags) \
-+{ \
-+ cmse_address_info_t __result; \
-+ __asm__ ("tt" # flags " %0,%1" \
-+ : "=r"(__result) \
-+ : "r"(__p) \
-+ : "memory"); \
-+ return __result; \
-+}
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+__cmse_TT_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM ()
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+cmse_TT (void *__p)
-+__CMSE_TT_ASM ()
-+
-+#define cmse_TTT_fptr(p) (__cmse_TTT_fptr ((__cmse_fptr)(p)))
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+__cmse_TTT_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM (t)
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+cmse_TTT (void *__p)
-+__CMSE_TT_ASM (t)
-+
-+#if __ARM_FEATURE_CMSE & 2
-+
-+#define cmse_TTA_fptr(p) (__cmse_TTA_fptr ((__cmse_fptr)(p)))
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+__cmse_TTA_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM (a)
-+
-+__extension__ static __inline __attribute__ ((__always_inline__))
-+cmse_address_info_t
-+cmse_TTA (void *__p)
-+__CMSE_TT_ASM (a)
-+
-+#define cmse_TTAT_fptr(p) (__cmse_TTAT_fptr ((__cmse_fptr)(p)))
-+
-+__extension__ static __inline cmse_address_info_t
-+__attribute__ ((__always_inline__))
-+__cmse_TTAT_fptr (__cmse_fptr __p)
-+__CMSE_TT_ASM (at)
-+
-+__extension__ static __inline cmse_address_info_t
-+__attribute__ ((__always_inline__))
-+cmse_TTAT (void *__p)
-+__CMSE_TT_ASM (at)
-+
-+/* FIXME: diagnose use outside cmse_nonsecure_entry functions. */
-+__extension__ static __inline int __attribute__ ((__always_inline__))
-+cmse_nonsecure_caller (void)
-+{
-+ return __builtin_arm_cmse_nonsecure_caller ();
-+}
-+
-+#define CMSE_AU_NONSECURE 2
-+#define CMSE_MPU_NONSECURE 16
-+#define CMSE_NONSECURE 18
-+
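-+/* A non-secure function pointer carries a cleared LSB: creating one
-+   clears bit 0, and testing for one checks that bit 0 is clear.  */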
-+#define cmse_nsfptr_create(p) ((typeof ((p))) ((intptr_t) (p) & ~1))
-+
-+#define cmse_is_nsfptr(p) (!((intptr_t) (p) & 1))
-+
-+#endif /* __ARM_FEATURE_CMSE & 2 */
-+
-+#define CMSE_MPU_UNPRIV 4
-+#define CMSE_MPU_READWRITE 1
-+#define CMSE_MPU_READ 8
-+
-+__extension__ void *
-+cmse_check_address_range (void *, size_t, int);
-+
-+#define cmse_check_pointed_object(p, f) \
-+ ((typeof ((p))) cmse_check_address_range ((p), sizeof (*(p)), (f)))
-+
-+#endif /* __ARM_FEATURE_CMSE & 1 */
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* _GCC_ARM_CMSE_H */
---- /dev/null
-+++ b/src/gcc/config/arm/arm_fp16.h
-@@ -0,0 +1,255 @@
-+/* ARM FP16 intrinsics include file.
-+
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef _GCC_ARM_FP16_H
-+#define _GCC_ARM_FP16_H 1
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+#include <stdint.h>
-+
-+/* Intrinsics for FP16 instructions. */
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=fp-armv8")
-+
-+#if defined (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
-+
-+typedef __fp16 float16_t;
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vabsh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vabshf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vaddh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a + __b;
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtah_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtahssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtah_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtahusi (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_s32 (int32_t __a)
-+{
-+ return __builtin_neon_vcvthshf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_f16_u32 (uint32_t __a)
-+{
-+ return __builtin_neon_vcvthuhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_s32 (int32_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvths_nhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vcvth_n_f16_u32 (uint32_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvthu_nhf ((int32_t)__a, __b);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvth_n_s32_f16 (float16_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvths_nsi (__a, __b);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvth_n_u32_f16 (float16_t __a, const int __b)
-+{
-+ return (uint32_t)__builtin_neon_vcvthu_nsi (__a, __b);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvth_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvthssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvth_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvthusi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtmh_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtmhssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtmh_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtmhusi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtnh_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtnhssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtnh_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtnhusi (__a);
-+}
-+
-+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+vcvtph_s32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtphssi (__a);
-+}
-+
-+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+vcvtph_u32_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vcvtphusi (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vdivh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a / __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vfmah_f16 (float16_t __a, float16_t __b, float16_t __c)
-+{
-+ return __builtin_neon_vfmahf (__a, __b, __c);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c)
-+{
-+ return __builtin_neon_vfmshf (__a, __b, __c);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmaxnmh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_neon_vmaxnmhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vminnmh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __builtin_neon_vminnmhf (__a, __b);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vmulh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a * __b;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vnegh_f16 (float16_t __a)
-+{
-+ return - __a;
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndah_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndahf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndih_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndihf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndmh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndmhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndnh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndnhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndph_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndphf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vrndxh_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vrndxhf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vsqrth_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vsqrthf (__a);
-+}
-+
-+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
-+vsubh_f16 (float16_t __a, float16_t __b)
-+{
-+ return __a - __b;
-+}
-+
-+#endif /* __ARM_FEATURE_FP16_SCALAR_ARITHMETIC */
-+#pragma GCC pop_options
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif
---- a/src/gcc/config/arm/arm_neon.h
-+++ b/src/gcc/config/arm/arm_neon.h
-@@ -38,6 +38,7 @@
- extern "C" {
- #endif
-
-+#include <arm_fp16.h>
- #include <stdint.h>
-
- typedef __simd64_int8_t int8x8_t;
-@@ -509,528 +510,614 @@ typedef struct poly64x2x4_t
- #pragma GCC pop_options
-
- /* vadd */
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_f32 (float32x2_t __a, float32x2_t __b)
- {
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a + __b;
- #else
- return (float32x2_t) __builtin_neon_vaddv2sf (__a, __b);
- #endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vadd_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_f32 (float32x4_t __a, float32x4_t __b)
- {
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a + __b;
- #else
- return (float32x4_t) __builtin_neon_vaddv4sf (__a, __b);
- #endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a + __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vaddlsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vaddlsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vaddlsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vaddluv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vaddluv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddl_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vaddluv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_s8 (int16x8_t __a, int8x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vaddwsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_s16 (int32x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vaddwsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_s32 (int64x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vaddwsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vaddwuv8qi ((int16x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_u16 (uint32x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vaddwuv4hi ((int32x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddw_u32 (uint64x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vaddwuv2si ((int64x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vhaddsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vhaddsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vhaddsv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vhadduv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vhadduv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vhadduv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vhaddsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vhaddsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vhaddsv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vhadduv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vhadduv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vhadduv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vrhaddsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vrhaddsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vrhaddsv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vrhadduv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vrhadduv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vrhadduv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vrhaddsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vrhaddsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vrhaddsv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vrhadduv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vrhadduv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vrhadduv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vqaddsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vqaddsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vqaddsv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t)__builtin_neon_vqaddsdi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vqadduv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vqadduv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vqadduv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return (uint64x1_t)__builtin_neon_vqaddudi ((int64x1_t) __a, (int64x1_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vqaddsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vqaddsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqaddsv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vqaddsv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vqadduv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vqadduv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vqadduv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vqadduv2di ((int64x2_t) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaddhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vraddhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vraddhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vraddhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vraddhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vraddhnv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vraddhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vraddhnv2di ((int64x2_t) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_f32 (float32x2_t __a, float32x2_t __b)
- {
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a * __b;
- #else
- return (float32x2_t) __builtin_neon_vmulfv2sf (__a, __b);
-@@ -1038,493 +1125,574 @@ vmul_f32 (float32x2_t __a, float32x2_t __b)
-
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_f32 (float32x4_t __a, float32x4_t __b)
- {
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a * __b;
- #else
- return (float32x4_t) __builtin_neon_vmulfv4sf (__a, __b);
- #endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a * __b;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (poly8x8_t)__builtin_neon_vmulpv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- return (poly8x16_t)__builtin_neon_vmulpv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vqrdmulhv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vqrdmulhv2si (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vqrdmulhv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqrdmulhv4si (__a, __b);
- }
-
- #ifdef __ARM_FEATURE_QRDMX
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int16x4_t)__builtin_neon_vqrdmlahv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int32x2_t)__builtin_neon_vqrdmlahv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vqrdmlahv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vqrdmlahv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int16x4_t)__builtin_neon_vqrdmlshv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int32x2_t)__builtin_neon_vqrdmlshv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vqrdmlshv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vqrdmlshv4si (__a, __b, __c);
- }
- #endif
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vmullsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vmullsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vmullsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vmulluv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vmulluv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vmulluv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (poly16x8_t)__builtin_neon_vmullpv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmull_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmull_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
- return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
- {
- return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
- return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
- {
- return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vmlalsv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vmlalsv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int64x2_t)__builtin_neon_vmlalsv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vmlaluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmlaluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint64x2_t)__builtin_neon_vmlaluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
- return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
- {
- return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
- return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
- {
- return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vmlslsv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vmlslsv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int64x2_t)__builtin_neon_vmlslsv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vmlsluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmlsluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint64x2_t)__builtin_neon_vmlsluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c);
-@@ -1532,25 +1700,29 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=neon-vfpv4")
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
- return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
- return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
- return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
- return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c);
-@@ -1558,7 +1730,8 @@ vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
- #pragma GCC pop_options
-
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndn_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrintnv2sf (__a);
-@@ -1566,7 +1739,8 @@ vrndn_f32 (float32x2_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndnq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrintnv4sf (__a);
-@@ -1574,7 +1748,8 @@ vrndnq_f32 (float32x4_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrnda_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrintav2sf (__a);
-@@ -1582,7 +1757,8 @@ vrnda_f32 (float32x2_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndaq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrintav4sf (__a);
-@@ -1590,7 +1766,8 @@ vrndaq_f32 (float32x4_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndp_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrintpv2sf (__a);
-@@ -1598,7 +1775,8 @@ vrndp_f32 (float32x2_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndpq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrintpv4sf (__a);
-@@ -1606,7 +1784,8 @@ vrndpq_f32 (float32x4_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndm_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrintmv2sf (__a);
-@@ -1614,7 +1793,8 @@ vrndm_f32 (float32x2_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndmq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrintmv4sf (__a);
-@@ -1623,7 +1803,8 @@ vrndmq_f32 (float32x4_t __a)
- #endif
-
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndx_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrintxv2sf (__a);
-@@ -1632,7 +1813,8 @@ vrndx_f32 (float32x2_t __a)
- #endif
-
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndxq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrintxv4sf (__a);
-@@ -1641,7 +1823,8 @@ vrndxq_f32 (float32x4_t __a)
- #endif
-
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrnd_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrintzv2sf (__a);
-@@ -1649,7 +1832,8 @@ vrnd_f32 (float32x2_t __a)
-
- #endif
- #if __ARM_ARCH >= 8
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrndq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrintzv4sf (__a);
-@@ -1657,2907 +1841,3436 @@ vrndq_f32 (float32x4_t __a)
-
- #endif
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_f32 (float32x2_t __a, float32x2_t __b)
- {
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a - __b;
- #else
- return (float32x2_t) __builtin_neon_vsubv2sf (__a, __b);
- #endif
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsub_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_f32 (float32x4_t __a, float32x4_t __b)
- {
--#ifdef __FAST_MATH
-+#ifdef __FAST_MATH__
- return __a - __b;
- #else
- return (float32x4_t) __builtin_neon_vsubv4sf (__a, __b);
- #endif
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a - __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vsublsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vsublsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vsublsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vsubluv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vsubluv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vsubluv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_s8 (int16x8_t __a, int8x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vsubwsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_s16 (int32x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vsubwsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_s32 (int64x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vsubwsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vsubwuv8qi ((int16x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vsubwuv4hi ((int32x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vsubwuv2si ((int64x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vhsubsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vhsubsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vhsubsv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vhsubuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vhsubuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vhsubuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vhsubsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vhsubsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vhsubsv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vhsubuv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vhsubuv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vhsubuv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vqsubsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vqsubsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vqsubsv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t)__builtin_neon_vqsubsdi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vqsubuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vqsubuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vqsubuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return (uint64x1_t)__builtin_neon_vqsubudi ((int64x1_t) __a, (int64x1_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vqsubsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vqsubsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqsubsv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vqsubsv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vqsubuv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vqsubuv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vqsubuv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vqsubuv2di ((int64x2_t) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vrsubhnv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vrsubhnv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vrsubhnv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vrsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vrsubhnv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vrsubhnv2di ((int64x2_t) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgeuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgeuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcge_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgeuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgeuv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgeuv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgeuv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgeuv8qi ((int8x8_t) __b, (int8x8_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgeuv4hi ((int16x4_t) __b, (int16x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcle_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgeuv2si ((int32x2_t) __b, (int32x2_t) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgeuv16qi ((int8x16_t) __b, (int8x16_t) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgeuv8hi ((int16x8_t) __b, (int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgeuv4si ((int32x4_t) __b, (int32x4_t) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgtuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgtuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgtuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgtuv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgtuv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgtuv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vcgtuv8qi ((int8x8_t) __b, (int8x8_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vcgtuv4hi ((int16x4_t) __b, (int16x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclt_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcgtuv2si ((int32x2_t) __b, (int32x2_t) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcgtuv16qi ((int8x16_t) __b, (int8x16_t) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcgtuv8hi ((int16x8_t) __b, (int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcgtuv4si ((int32x4_t) __b, (int32x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcage_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcageq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcale_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcaleq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcagt_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcagtq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcalt_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcaltq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtst_p16 (poly16x4_t __a, poly16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b);
-+}
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtstq_p16 (poly16x8_t __a, poly16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b);
-+}
-+
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vabdsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vabdsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vabdsv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vabdfv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vabduv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vabduv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vabduv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vabdsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vabdsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vabdsv4si (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (float32x4_t)__builtin_neon_vabdfv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vabduv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vabduv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vabduv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vabdlsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vabdlsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vabdlsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vabdluv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vabdluv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabdl_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vabdluv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int8x8_t)__builtin_neon_vabasv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int16x4_t)__builtin_neon_vabasv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int32x2_t)__builtin_neon_vabasv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint8x8_t)__builtin_neon_vabauv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint16x4_t)__builtin_neon_vabauv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint32x2_t)__builtin_neon_vabauv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
- {
- return (int8x16_t)__builtin_neon_vabasv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vabasv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vabasv4si (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
- {
- return (uint8x16_t)__builtin_neon_vabauv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vabauv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vabauv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vabalsv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vabalsv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int64x2_t)__builtin_neon_vabalsv2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vabaluv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vabaluv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint64x2_t)__builtin_neon_vabaluv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vmaxsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vmaxsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vmaxsv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vmaxfv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vmaxuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vmaxuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmax_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vmaxuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vmaxsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vmaxsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vmaxsv4si (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (float32x4_t)__builtin_neon_vmaxfv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=neon-fp-armv8")
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f32 (float32x2_t a, float32x2_t b)
-+{
-+ return (float32x2_t)__builtin_neon_vmaxnmv2sf (a, b);
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f32 (float32x4_t a, float32x4_t b)
-+{
-+ return (float32x4_t)__builtin_neon_vmaxnmv4sf (a, b);
-+}
-+
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f32 (float32x2_t a, float32x2_t b)
-+{
-+ return (float32x2_t)__builtin_neon_vminnmv2sf (a, b);
-+}
-+
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f32 (float32x4_t a, float32x4_t b)
-+{
-+ return (float32x4_t)__builtin_neon_vminnmv4sf (a, b);
-+}
-+#pragma GCC pop_options
-+
-+
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vmaxuv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vmaxuv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vmaxuv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vminsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vminsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vminsv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vminfv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vminuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vminuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmin_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vminuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vminsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vminsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vminsv4si (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (float32x4_t)__builtin_neon_vminfv4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vminuv16qi ((int8x16_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vminuv8hi ((int16x8_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vminq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vminuv4si ((int32x4_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddl_s8 (int8x8_t __a)
- {
- return (int16x4_t)__builtin_neon_vpaddlsv8qi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddl_s16 (int16x4_t __a)
- {
- return (int32x2_t)__builtin_neon_vpaddlsv4hi (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddl_s32 (int32x2_t __a)
- {
- return (int64x1_t)__builtin_neon_vpaddlsv2si (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddl_u8 (uint8x8_t __a)
- {
- return (uint16x4_t)__builtin_neon_vpaddluv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddl_u16 (uint16x4_t __a)
- {
- return (uint32x2_t)__builtin_neon_vpaddluv4hi ((int16x4_t) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddl_u32 (uint32x2_t __a)
- {
- return (uint64x1_t)__builtin_neon_vpaddluv2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddlq_s8 (int8x16_t __a)
- {
- return (int16x8_t)__builtin_neon_vpaddlsv16qi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddlq_s16 (int16x8_t __a)
- {
- return (int32x4_t)__builtin_neon_vpaddlsv8hi (__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddlq_s32 (int32x4_t __a)
- {
- return (int64x2_t)__builtin_neon_vpaddlsv4si (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddlq_u8 (uint8x16_t __a)
- {
- return (uint16x8_t)__builtin_neon_vpaddluv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddlq_u16 (uint16x8_t __a)
- {
- return (uint32x4_t)__builtin_neon_vpaddluv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpaddlq_u32 (uint32x4_t __a)
- {
- return (uint64x2_t)__builtin_neon_vpaddluv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadal_s8 (int16x4_t __a, int8x8_t __b)
- {
- return (int16x4_t)__builtin_neon_vpadalsv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadal_s16 (int32x2_t __a, int16x4_t __b)
- {
- return (int32x2_t)__builtin_neon_vpadalsv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadal_s32 (int64x1_t __a, int32x2_t __b)
- {
- return (int64x1_t)__builtin_neon_vpadalsv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
- {
- return (uint16x4_t)__builtin_neon_vpadaluv8qi ((int16x4_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
- {
- return (uint32x2_t)__builtin_neon_vpadaluv4hi ((int32x2_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadal_u32 (uint64x1_t __a, uint32x2_t __b)
- {
- return (uint64x1_t)__builtin_neon_vpadaluv2si ((int64x1_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadalq_s8 (int16x8_t __a, int8x16_t __b)
- {
- return (int16x8_t)__builtin_neon_vpadalsv16qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadalq_s16 (int32x4_t __a, int16x8_t __b)
- {
- return (int32x4_t)__builtin_neon_vpadalsv8hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadalq_s32 (int64x2_t __a, int32x4_t __b)
- {
- return (int64x2_t)__builtin_neon_vpadalsv4si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
- {
- return (uint16x8_t)__builtin_neon_vpadaluv16qi ((int16x8_t) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
- {
- return (uint32x4_t)__builtin_neon_vpadaluv8hi ((int32x4_t) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
- {
- return (uint64x2_t)__builtin_neon_vpadaluv4si ((int64x2_t) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vpmaxsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vpmaxsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vpmaxsv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vpmaxfv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vpmaxuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vpmaxuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmax_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vpmaxuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vpminsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vpminsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vpminsv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vpminfv2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vpminuv8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vpminuv4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vpmin_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vpminuv2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrecps_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
- {
- return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vshlsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vshlsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vshlsv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t)__builtin_neon_vshlsdi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vshluv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vshluv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vshluv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
- return (uint64x1_t)__builtin_neon_vshludi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vshlsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vshlsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vshlsv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vshlsv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vshluv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vshluv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vshluv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vshluv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vrshlsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vrshlsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vrshlsv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t)__builtin_neon_vrshlsdi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vrshluv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vrshluv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vrshluv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
- return (uint64x1_t)__builtin_neon_vrshludi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vrshlsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vrshlsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vrshlsv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vrshlsv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vrshluv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vrshluv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vrshluv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vrshluv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vqshlsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vqshlsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vqshlsv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t)__builtin_neon_vqshlsdi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vqshluv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vqshluv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vqshluv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
- return (uint64x1_t)__builtin_neon_vqshludi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vqshlsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vqshlsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqshlsv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vqshlsv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vqshluv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vqshluv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vqshluv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vqshluv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vqrshlsv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x4_t)__builtin_neon_vqrshlsv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x2_t)__builtin_neon_vqrshlsv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x1_t)__builtin_neon_vqrshlsdi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vqrshluv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
- {
- return (uint16x4_t)__builtin_neon_vqrshluv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
- {
- return (uint32x2_t)__builtin_neon_vqrshluv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
- {
- return (uint64x1_t)__builtin_neon_vqrshludi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return (int8x16_t)__builtin_neon_vqrshlsv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return (int16x8_t)__builtin_neon_vqrshlsv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return (int32x4_t)__builtin_neon_vqrshlsv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return (int64x2_t)__builtin_neon_vqrshlsv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
- {
- return (uint8x16_t)__builtin_neon_vqrshluv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
- {
- return (uint16x8_t)__builtin_neon_vqrshluv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
- {
- return (uint32x4_t)__builtin_neon_vqrshluv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
- {
- return (uint64x2_t)__builtin_neon_vqrshluv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_s8 (int8x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vshrs_nv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_s16 (int16x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vshrs_nv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_s32 (int32x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vshrs_nv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_s64 (int64x1_t __a, const int __b)
- {
- return (int64x1_t)__builtin_neon_vshrs_ndi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vshru_nv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vshru_nv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vshru_nv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshr_n_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64x1_t)__builtin_neon_vshru_ndi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_s8 (int8x16_t __a, const int __b)
- {
- return (int8x16_t)__builtin_neon_vshrs_nv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_s16 (int16x8_t __a, const int __b)
- {
- return (int16x8_t)__builtin_neon_vshrs_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_s32 (int32x4_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vshrs_nv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_s64 (int64x2_t __a, const int __b)
- {
- return (int64x2_t)__builtin_neon_vshrs_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_u8 (uint8x16_t __a, const int __b)
- {
- return (uint8x16_t)__builtin_neon_vshru_nv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vshru_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vshru_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrq_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vshru_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_s8 (int8x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vrshrs_nv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_s16 (int16x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vrshrs_nv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_s32 (int32x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vrshrs_nv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_s64 (int64x1_t __a, const int __b)
- {
- return (int64x1_t)__builtin_neon_vrshrs_ndi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vrshru_nv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vrshru_nv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vrshru_nv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshr_n_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64x1_t)__builtin_neon_vrshru_ndi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_s8 (int8x16_t __a, const int __b)
- {
- return (int8x16_t)__builtin_neon_vrshrs_nv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_s16 (int16x8_t __a, const int __b)
- {
- return (int16x8_t)__builtin_neon_vrshrs_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_s32 (int32x4_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vrshrs_nv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_s64 (int64x2_t __a, const int __b)
- {
- return (int64x2_t)__builtin_neon_vrshrs_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_u8 (uint8x16_t __a, const int __b)
- {
- return (uint8x16_t)__builtin_neon_vrshru_nv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vrshru_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vrshru_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrq_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vrshru_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrn_n_s16 (int16x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrn_n_s32 (int32x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrn_n_s64 (int64x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrn_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrn_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshrn_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrn_n_s16 (int16x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vrshrn_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrn_n_s32 (int32x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vrshrn_nv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrn_n_s64 (int64x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vrshrn_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrn_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vrshrn_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrn_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vrshrn_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrshrn_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vrshrn_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrn_n_s16 (int16x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vqshrns_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrn_n_s32 (int32x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vqshrns_nv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrn_n_s64 (int64x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vqshrns_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrn_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vqshrnu_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrn_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vqshrnu_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrn_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vqshrnu_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrn_n_s16 (int16x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vqrshrns_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrn_n_s32 (int32x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vqrshrns_nv4si (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrn_n_s64 (int64x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vqrshrns_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrn_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vqrshrnu_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrn_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vqrshrnu_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrn_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vqrshrnu_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrun_n_s16 (int16x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrun_n_s32 (int32x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshrun_n_s64 (int64x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrun_n_s16 (int16x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vqrshrun_nv8hi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrun_n_s32 (int32x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vqrshrun_nv4si (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrshrun_n_s64 (int64x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vqrshrun_nv2di (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_s8 (int8x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_s16 (int16x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_s32 (int32x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_s64 (int64x1_t __a, const int __b)
- {
- return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshl_n_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_s8 (int8x16_t __a, const int __b)
- {
- return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_s16 (int16x8_t __a, const int __b)
- {
- return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_s32 (int32x4_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_s64 (int64x2_t __a, const int __b)
- {
- return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_u8 (uint8x16_t __a, const int __b)
- {
- return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshlq_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_s8 (int8x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vqshl_s_nv8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_s16 (int16x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vqshl_s_nv4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_s32 (int32x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vqshl_s_nv2si (__a, __b);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_s64 (int64x1_t __a, const int __b)
- {
- return (int64x1_t)__builtin_neon_vqshl_s_ndi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vqshl_u_nv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vqshl_u_nv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vqshl_u_nv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshl_n_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64x1_t)__builtin_neon_vqshl_u_ndi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_s8 (int8x16_t __a, const int __b)
- {
- return (int8x16_t)__builtin_neon_vqshl_s_nv16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_s16 (int16x8_t __a, const int __b)
- {
- return (int16x8_t)__builtin_neon_vqshl_s_nv8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_s32 (int32x4_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vqshl_s_nv4si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_s64 (int64x2_t __a, const int __b)
- {
- return (int64x2_t)__builtin_neon_vqshl_s_nv2di (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_u8 (uint8x16_t __a, const int __b)
- {
- return (uint8x16_t)__builtin_neon_vqshl_u_nv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_u16 (uint16x8_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vqshl_u_nv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_u32 (uint32x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vqshl_u_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlq_n_u64 (uint64x2_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vqshl_u_nv2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlu_n_s8 (int8x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlu_n_s16 (int16x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlu_n_s32 (int32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshlu_n_s64 (int64x1_t __a, const int __b)
- {
- return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshluq_n_s8 (int8x16_t __a, const int __b)
- {
- return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshluq_n_s16 (int16x8_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshluq_n_s32 (int32x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqshluq_n_s64 (int64x2_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshll_n_s8 (int8x8_t __a, const int __b)
- {
- return (int16x8_t)__builtin_neon_vshlls_nv8qi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshll_n_s16 (int16x4_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vshlls_nv4hi (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshll_n_s32 (int32x2_t __a, const int __b)
- {
- return (int64x2_t)__builtin_neon_vshlls_nv2si (__a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshll_n_u8 (uint8x8_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vshllu_nv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshll_n_u16 (uint16x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vshllu_nv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vshll_n_u32 (uint32x2_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vshllu_nv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vsras_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vsras_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vsras_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vsras_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vsrau_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vsrau_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vsrau_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vsrau_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vsras_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vsras_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vsras_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vsras_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vsrau_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vsrau_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vsrau_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vsrau_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vrsras_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vrsras_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vrsras_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vrsras_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vrsrau_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vrsrau_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vrsrau_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vrsrau_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vrsras_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vrsras_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vrsras_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vrsras_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vrsrau_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vrsrau_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vrsrau_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vrsrau_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
-@@ -4565,68 +5278,79 @@ vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
- {
- return (poly64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
- {
- return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
- {
- return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-@@ -4634,68 +5358,79 @@ vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
- {
- return (poly64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
- {
- return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
- {
- return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-@@ -4703,68 +5438,79 @@ vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
- {
- return (poly64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
- {
- return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
- {
- return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-@@ -4772,530 +5518,618 @@ vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
- {
- return (poly64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
- {
- return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
- {
- return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabs_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vabsv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabs_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vabsv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabs_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vabsv2si (__a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabs_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vabsv2sf (__a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabsq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vabsv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabsq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vabsv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabsq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vabsv4si (__a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vabsq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vabsv4sf (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vqabsv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vqabsv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabs_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vqabsv2si (__a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabsq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vqabsv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabsq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vqabsv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqabsq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vqabsv4si (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vneg_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vnegv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vneg_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vnegv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vneg_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vnegv2si (__a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vneg_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vnegv2sf (__a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vnegq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vnegv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vnegq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vnegv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vnegq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vnegv4si (__a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vnegq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vnegv4sf (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vqnegv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vqnegv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqneg_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vqnegv2si (__a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqnegq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vqnegv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqnegq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vqnegv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqnegq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vqnegv4si (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vmvnv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vmvnv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vmvnv2si (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_u8 (uint8x8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_u16 (uint16x4_t __a)
- {
- return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_u32 (uint32x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvn_p8 (poly8x8_t __a)
- {
- return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vmvnv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vmvnv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vmvnv4si (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_u8 (uint8x16_t __a)
- {
- return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_u16 (uint16x8_t __a)
- {
- return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_u32 (uint32x4_t __a)
- {
- return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmvnq_p8 (poly8x16_t __a)
- {
- return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcls_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vclsv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcls_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vclsv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcls_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vclsv2si (__a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclsq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vclsv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclsq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vclsv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclsq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vclsv4si (__a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclz_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vclzv8qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclz_s16 (int16x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vclzv4hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclz_s32 (int32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vclzv2si (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclz_u8 (uint8x8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclz_u16 (uint16x4_t __a)
- {
- return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclz_u32 (uint32x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclzq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vclzv16qi (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclzq_s16 (int16x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vclzv8hi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclzq_s32 (int32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vclzv4si (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclzq_u8 (uint8x16_t __a)
- {
- return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclzq_u16 (uint16x8_t __a)
- {
- return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vclzq_u32 (uint32x4_t __a)
- {
- return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcnt_s8 (int8x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vcntv8qi (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcnt_u8 (uint8x8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcnt_p8 (poly8x8_t __a)
- {
- return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcntq_s8 (int8x16_t __a)
- {
- return (int8x16_t)__builtin_neon_vcntv16qi (__a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcntq_u8 (uint8x16_t __a)
- {
- return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcntq_p8 (poly8x16_t __a)
- {
- return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrecpe_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrecpev2sf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrecpe_u32 (uint32x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrecpeq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrecpev4sf (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrecpeq_u32 (uint32x4_t __a)
- {
- return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsqrte_f32 (float32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsqrte_u32 (uint32x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsqrteq_f32 (float32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrsqrteq_u32 (uint32x4_t __a)
- {
- return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s8 (int8x8_t __a, const int __b)
- {
- return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s16 (int16x4_t __a, const int __b)
- {
- return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s32 (int32x2_t __a, const int __b)
- {
- return (int32_t)__builtin_neon_vget_lanev2si (__a, __b);
-@@ -5328,67 +6162,88 @@ vget_lane_s32 (int32x2_t __a, const int __b)
- })
- #endif
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_f32 (float32x2_t __a, const int __b)
- {
- return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8_t)__builtin_neon_vget_laneuv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16_t)__builtin_neon_vget_laneuv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32_t)__builtin_neon_vget_laneuv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_p8 (poly8x8_t __a, const int __b)
- {
- return (poly8_t)__builtin_neon_vget_laneuv8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_p16 (poly16x4_t __a, const int __b)
- {
- return (poly16_t)__builtin_neon_vget_laneuv4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_s64 (int64x1_t __a, const int __b)
- {
- return (int64_t)__builtin_neon_vget_lanedi (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=crypto-neon-fp-armv8")
-+__extension__ extern __inline poly64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vget_lane_p64 (poly64x1_t __a, const int __b)
-+{
-+ return (poly64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
-+}
-+
-+#pragma GCC pop_options
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_lane_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s8 (int8x16_t __a, const int __b)
- {
- return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b);
- }
-
--__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s16 (int16x8_t __a, const int __b)
- {
- return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b);
- }
-
--__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s32 (int32x4_t __a, const int __b)
- {
- return (int32_t)__builtin_neon_vget_lanev4si (__a, __b);
-@@ -5405,67 +6260,78 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
- })
- #endif
-
--__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_f32 (float32x4_t __a, const int __b)
- {
- return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b);
- }
-
--__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u8 (uint8x16_t __a, const int __b)
- {
- return (uint8_t)__builtin_neon_vget_laneuv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u16 (uint16x8_t __a, const int __b)
- {
- return (uint16_t)__builtin_neon_vget_laneuv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u32 (uint32x4_t __a, const int __b)
- {
- return (uint32_t)__builtin_neon_vget_laneuv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_p8 (poly8x16_t __a, const int __b)
- {
- return (poly8_t)__builtin_neon_vget_laneuv16qi ((int8x16_t) __a, __b);
- }
-
--__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_p16 (poly16x8_t __a, const int __b)
- {
- return (poly16_t)__builtin_neon_vget_laneuv8hi ((int16x8_t) __a, __b);
- }
-
--__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_s64 (int64x2_t __a, const int __b)
- {
- return (int64_t)__builtin_neon_vget_lanev2di (__a, __b);
- }
-
--__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vgetq_lane_u64 (uint64x2_t __a, const int __b)
- {
- return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
-@@ -5483,67 +6349,78 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
- })
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
- {
- return (float32x2_t)__builtin_neon_vset_lanev2sf ((__builtin_neon_sf) __a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c)
- {
- return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c)
- {
- return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
-@@ -5561,49 +6438,57 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
- })
- #endif
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
- {
- return (float32x4_t)__builtin_neon_vset_lanev4sf ((__builtin_neon_sf) __a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c)
- {
- return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c)
- {
- return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c);
-@@ -5611,136 +6496,158 @@ vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_p64 (uint64_t __a)
- {
- return (poly64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s8 (uint64_t __a)
- {
- return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s16 (uint64_t __a)
- {
- return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s32 (uint64_t __a)
- {
- return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_s64 (uint64_t __a)
- {
- return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_f16 (uint64_t __a)
- {
- return (float16x4_t) __a;
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_f32 (uint64_t __a)
- {
- return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u8 (uint64_t __a)
- {
- return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u16 (uint64_t __a)
- {
- return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u32 (uint64_t __a)
- {
- return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_u64 (uint64_t __a)
- {
- return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_p8 (uint64_t __a)
- {
- return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcreate_p16 (uint64_t __a)
- {
- return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_s8 (int8_t __a)
- {
- return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_s16 (int16_t __a)
- {
- return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_s32 (int32_t __a)
- {
- return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_f32 (float32_t __a)
- {
- return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_u8 (uint8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_u16 (uint16_t __a)
- {
- return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_u32 (uint32_t __a)
- {
- return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_p8 (poly8_t __a)
- {
- return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_p16 (poly16_t __a)
- {
- return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-@@ -5748,20 +6655,23 @@ vdup_n_p16 (poly16_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_p64 (poly64_t __a)
- {
- return (poly64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_s64 (int64_t __a)
- {
- return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_n_u64 (uint64_t __a)
- {
- return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
-@@ -5769,260 +6679,303 @@ vdup_n_u64 (uint64_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_p64 (poly64_t __a)
- {
- return (poly64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_s8 (int8_t __a)
- {
- return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_s16 (int16_t __a)
- {
- return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_s32 (int32_t __a)
- {
- return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_f32 (float32_t __a)
- {
- return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_u8 (uint8_t __a)
- {
- return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_u16 (uint16_t __a)
- {
- return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_u32 (uint32_t __a)
- {
- return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_p8 (poly8_t __a)
- {
- return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_p16 (poly16_t __a)
- {
- return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_s64 (int64_t __a)
- {
- return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_n_u64 (uint64_t __a)
- {
- return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_s8 (int8_t __a)
- {
- return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_s16 (int16_t __a)
- {
- return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_s32 (int32_t __a)
- {
- return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_f32 (float32_t __a)
- {
- return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_u8 (uint8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_u16 (uint16_t __a)
- {
- return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_u32 (uint32_t __a)
- {
- return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_p8 (poly8_t __a)
- {
- return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_p16 (poly16_t __a)
- {
- return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_s64 (int64_t __a)
- {
- return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmov_n_u64 (uint64_t __a)
- {
- return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_s8 (int8_t __a)
- {
- return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_s16 (int16_t __a)
- {
- return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_s32 (int32_t __a)
- {
- return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_f32 (float32_t __a)
- {
- return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_u8 (uint8_t __a)
- {
- return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_u16 (uint16_t __a)
- {
- return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_u32 (uint32_t __a)
- {
- return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_p8 (poly8_t __a)
- {
- return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_p16 (poly16_t __a)
- {
- return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_s64 (int64_t __a)
- {
- return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovq_n_u64 (uint64_t __a)
- {
- return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_s8 (int8x8_t __a, const int __b)
- {
- return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_s16 (int16x4_t __a, const int __b)
- {
- return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_s32 (int32x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_f32 (float32x2_t __a, const int __b)
- {
- return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_p8 (poly8x8_t __a, const int __b)
- {
- return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_p16 (poly16x4_t __a, const int __b)
- {
- return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
-@@ -6030,74 +6983,86 @@ vdup_lane_p16 (poly16x4_t __a, const int __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_p64 (poly64x1_t __a, const int __b)
- {
- return (poly64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_s64 (int64x1_t __a, const int __b)
- {
- return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdup_lane_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_s8 (int8x8_t __a, const int __b)
- {
- return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_s16 (int16x4_t __a, const int __b)
- {
- return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_s32 (int32x2_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_f32 (float32x2_t __a, const int __b)
- {
- return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_u8 (uint8x8_t __a, const int __b)
- {
- return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_u16 (uint16x4_t __a, const int __b)
- {
- return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_u32 (uint32x2_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_p8 (poly8x8_t __a, const int __b)
- {
- return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_p16 (poly16x4_t __a, const int __b)
- {
- return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
-@@ -6105,20 +7070,23 @@ vdupq_lane_p16 (poly16x4_t __a, const int __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_p64 (poly64x1_t __a, const int __b)
- {
- return (poly64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_s64 (int64x1_t __a, const int __b)
- {
- return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vdupq_lane_u64 (uint64x1_t __a, const int __b)
- {
- return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b);
-@@ -6126,82 +7094,95 @@ vdupq_lane_u64 (uint64x1_t __a, const int __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_p64 (poly64x1_t __a, poly64x1_t __b)
- {
- return (poly64x2_t)__builtin_neon_vcombinedi (__a, __b);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s16 (int16x4_t __a, int16x4_t __b)
- {
- return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s32 (int32x2_t __a, int32x2_t __b)
- {
- return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_s64 (int64x1_t __a, int64x1_t __b)
- {
- return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_f16 (float16x4_t __a, float16x4_t __b)
- {
- return __builtin_neon_vcombinev4hf (__a, __b);
- }
- #endif
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_f32 (float32x2_t __a, float32x2_t __b)
- {
- return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
- {
- return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
-@@ -6209,144 +7190,167 @@ vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_p64 (poly64x2_t __a)
- {
- return (poly64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s8 (int8x16_t __a)
- {
- return (int8x8_t)__builtin_neon_vget_highv16qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s16 (int16x8_t __a)
- {
- return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s32 (int32x4_t __a)
- {
- return (int32x2_t)__builtin_neon_vget_highv4si (__a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_s64 (int64x2_t __a)
- {
- return (int64x1_t)__builtin_neon_vget_highv2di (__a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_f16 (float16x8_t __a)
- {
- return __builtin_neon_vget_highv8hf (__a);
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_f32 (float32x4_t __a)
- {
- return (float32x2_t)__builtin_neon_vget_highv4sf (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u8 (uint8x16_t __a)
- {
- return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u16 (uint16x8_t __a)
- {
- return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u32 (uint32x4_t __a)
- {
- return (uint32x2_t)__builtin_neon_vget_highv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_u64 (uint64x2_t __a)
- {
- return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_p8 (poly8x16_t __a)
- {
- return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_high_p16 (poly16x8_t __a)
- {
- return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s8 (int8x16_t __a)
- {
- return (int8x8_t)__builtin_neon_vget_lowv16qi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s16 (int16x8_t __a)
- {
- return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s32 (int32x4_t __a)
- {
- return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_f16 (float16x8_t __a)
- {
- return __builtin_neon_vget_lowv8hf (__a);
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_f32 (float32x4_t __a)
- {
- return (float32x2_t)__builtin_neon_vget_lowv4sf (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u8 (uint8x16_t __a)
- {
- return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u16 (uint16x8_t __a)
- {
- return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u32 (uint32x4_t __a)
- {
- return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_p8 (poly8x16_t __a)
- {
- return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_p16 (poly16x8_t __a)
- {
- return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
-@@ -6354,68 +7358,79 @@ vget_low_p16 (poly16x8_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_p64 (poly64x2_t __a)
- {
- return (poly64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_s64 (int64x2_t __a)
- {
- return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vget_low_u64 (uint64x2_t __a)
- {
- return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_s32_f32 (float32x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vcvtsv2sf (__a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_f32_s32 (int32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vcvtsv2si (__a);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_f32_u32 (uint32x2_t __a)
- {
- return (float32x2_t)__builtin_neon_vcvtuv2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_u32_f32 (float32x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vcvtuv2sf (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_s32_f32 (float32x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vcvtsv4sf (__a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_f32_s32 (int32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vcvtsv4si (__a);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_f32_u32 (uint32x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vcvtuv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_u32_f32 (float32x4_t __a)
- {
- return (uint32x4_t)__builtin_neon_vcvtuv4sf (__a);
-@@ -6424,7 +7439,8 @@ vcvtq_u32_f32 (float32x4_t __a)
- #pragma GCC push_options
- #pragma GCC target ("fpu=neon-fp16")
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_f16_f32 (float32x4_t __a)
- {
- return (float16x4_t)__builtin_neon_vcvtv4hfv4sf (__a);
-@@ -6432,7 +7448,8 @@ vcvt_f16_f32 (float32x4_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_f32_f16 (float16x4_t __a)
- {
- return (float32x4_t)__builtin_neon_vcvtv4sfv4hf (__a);
-@@ -6440,1059 +7457,1232 @@ vcvt_f32_f16 (float16x4_t __a)
- #endif
- #pragma GCC pop_options
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_n_s32_f32 (float32x2_t __a, const int __b)
- {
- return (int32x2_t)__builtin_neon_vcvts_nv2sf (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_n_f32_s32 (int32x2_t __a, const int __b)
- {
- return (float32x2_t)__builtin_neon_vcvts_nv2si (__a, __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
- {
- return (float32x2_t)__builtin_neon_vcvtu_nv2si ((int32x2_t) __a, __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvt_n_u32_f32 (float32x2_t __a, const int __b)
- {
- return (uint32x2_t)__builtin_neon_vcvtu_nv2sf (__a, __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
- {
- return (int32x4_t)__builtin_neon_vcvts_nv4sf (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
- {
- return (float32x4_t)__builtin_neon_vcvts_nv4si (__a, __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
- {
- return (float32x4_t)__builtin_neon_vcvtu_nv4si ((int32x4_t) __a, __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
- {
- return (uint32x4_t)__builtin_neon_vcvtu_nv4sf (__a, __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovn_s16 (int16x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vmovnv8hi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovn_s32 (int32x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vmovnv4si (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovn_s64 (int64x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vmovnv2di (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovn_u16 (uint16x8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovn_u32 (uint32x4_t __a)
- {
- return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovn_u64 (uint64x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovn_s16 (int16x8_t __a)
- {
- return (int8x8_t)__builtin_neon_vqmovnsv8hi (__a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovn_s32 (int32x4_t __a)
- {
- return (int16x4_t)__builtin_neon_vqmovnsv4si (__a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovn_s64 (int64x2_t __a)
- {
- return (int32x2_t)__builtin_neon_vqmovnsv2di (__a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovn_u16 (uint16x8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vqmovnuv8hi ((int16x8_t) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovn_u32 (uint32x4_t __a)
- {
- return (uint16x4_t)__builtin_neon_vqmovnuv4si ((int32x4_t) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovn_u64 (uint64x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vqmovnuv2di ((int64x2_t) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovun_s16 (int16x8_t __a)
- {
- return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovun_s32 (int32x4_t __a)
- {
- return (uint16x4_t)__builtin_neon_vqmovunv4si (__a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqmovun_s64 (int64x2_t __a)
- {
- return (uint32x2_t)__builtin_neon_vqmovunv2di (__a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovl_s8 (int8x8_t __a)
- {
- return (int16x8_t)__builtin_neon_vmovlsv8qi (__a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovl_s16 (int16x4_t __a)
- {
- return (int32x4_t)__builtin_neon_vmovlsv4hi (__a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovl_s32 (int32x2_t __a)
- {
- return (int64x2_t)__builtin_neon_vmovlsv2si (__a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovl_u8 (uint8x8_t __a)
- {
- return (uint16x8_t)__builtin_neon_vmovluv8qi ((int8x8_t) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovl_u16 (uint16x4_t __a)
- {
- return (uint32x4_t)__builtin_neon_vmovluv4hi ((int16x4_t) __a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmovl_u32 (uint32x2_t __a)
- {
- return (uint64x2_t)__builtin_neon_vmovluv2si ((int32x2_t) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl1_s8 (int8x8_t __a, int8x8_t __b)
- {
- return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl1_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl1_p8 (poly8x8_t __a, uint8x8_t __b)
- {
- return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl2_s8 (int8x8x2_t __a, int8x8_t __b)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
- return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
- return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
- return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl3_s8 (int8x8x3_t __a, int8x8_t __b)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
- return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
- return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
- return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl4_s8 (int8x8x4_t __a, int8x8_t __b)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
- return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
- return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
- return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c)
- {
- return (poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- return (poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c)
- {
- return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c)
- {
- return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
- {
- return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
- {
- return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
- {
- return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
- {
- return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
- {
- return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
- {
- return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
- {
- return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vmlals_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int64x2_t)__builtin_neon_vmlals_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
- {
- return (uint32x4_t)__builtin_neon_vmlalu_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
- {
- return (uint64x2_t)__builtin_neon_vmlalu_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
- {
- return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
- {
- return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
- {
- return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
- {
- return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
- {
- return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
- {
- return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
- {
- return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vmlsls_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int64x2_t)__builtin_neon_vmlsls_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
- {
- return (uint32x4_t)__builtin_neon_vmlslu_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
- {
- return (uint64x2_t)__builtin_neon_vmlslu_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vmulls_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vmulls_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vmullu_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vmullu_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vqrdmulh_lanev8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vqrdmulh_lanev4si (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vqrdmulh_lanev4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vqrdmulh_lanev2si (__a, __b, __c);
- }
-
- #ifdef __ARM_FEATURE_QRDMX
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlahq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
- {
- return (int16x8_t)__builtin_neon_vqrdmlah_lanev8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlahq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vqrdmlah_lanev4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlah_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int16x4_t)__builtin_neon_vqrdmlah_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlah_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int32x2_t)__builtin_neon_vqrdmlah_lanev2si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlshq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
- {
- return (int16x8_t)__builtin_neon_vqrdmlsh_lanev8hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlshq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
- {
- return (int32x4_t)__builtin_neon_vqrdmlsh_lanev4si (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlsh_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
- {
- return (int16x4_t)__builtin_neon_vqrdmlsh_lanev4hi (__a, __b, __c, __d);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmlsh_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
- {
- return (int32x2_t)__builtin_neon_vqrdmlsh_lanev2si (__a, __b, __c, __d);
- }
- #endif
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_n_s16 (int16x4_t __a, int16_t __b)
- {
- return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_n_s32 (int32x2_t __a, int32_t __b)
- {
- return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_n_f32 (float32x2_t __a, float32_t __b)
- {
- return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_n_u16 (uint16x4_t __a, uint16_t __b)
- {
- return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmul_n_u32 (uint32x2_t __a, uint32_t __b)
- {
- return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_n_s16 (int16x8_t __a, int16_t __b)
- {
- return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_n_s32 (int32x4_t __a, int32_t __b)
- {
- return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_n_f32 (float32x4_t __a, float32_t __b)
- {
- return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
- {
- return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
- {
- return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_n_s16 (int16x4_t __a, int16_t __b)
- {
- return (int32x4_t)__builtin_neon_vmulls_nv4hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_n_s32 (int32x2_t __a, int32_t __b)
- {
- return (int64x2_t)__builtin_neon_vmulls_nv2si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_n_u16 (uint16x4_t __a, uint16_t __b)
- {
- return (uint32x4_t)__builtin_neon_vmullu_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_n_u32 (uint32x2_t __a, uint32_t __b)
- {
- return (uint64x2_t)__builtin_neon_vmullu_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmull_n_s16 (int16x4_t __a, int16_t __b)
- {
- return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmull_n_s32 (int32x2_t __a, int32_t __b)
- {
- return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
- {
- return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
- {
- return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
- {
- return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
- {
- return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
- {
- return (int16x8_t)__builtin_neon_vqrdmulh_nv8hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
- {
- return (int32x4_t)__builtin_neon_vqrdmulh_nv4si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
- {
- return (int16x4_t)__builtin_neon_vqrdmulh_nv4hi (__a, (__builtin_neon_hi) __b);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
- {
- return (int32x2_t)__builtin_neon_vqrdmulh_nv2si (__a, (__builtin_neon_si) __b);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
- {
- return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
- {
- return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
- {
- return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, (__builtin_neon_sf) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
- {
- return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
- {
- return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
- {
- return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
- {
- return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
- {
- return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, (__builtin_neon_sf) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
- {
- return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
- {
- return (int32x4_t)__builtin_neon_vmlals_nv4hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
- {
- return (int64x2_t)__builtin_neon_vmlals_nv2si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmlalu_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
- {
- return (uint64x2_t)__builtin_neon_vmlalu_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
- {
- return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
- {
- return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
- {
- return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
- {
- return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
- {
- return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, (__builtin_neon_sf) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
- {
- return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
- {
- return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
- {
- return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
- {
- return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
- {
- return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, (__builtin_neon_sf) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
- {
- return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
- {
- return (int32x4_t)__builtin_neon_vmlsls_nv4hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
- {
- return (int64x2_t)__builtin_neon_vmlsls_nv2si (__a, __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
- {
- return (uint32x4_t)__builtin_neon_vmlslu_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
- {
- return (uint64x2_t)__builtin_neon_vmlslu_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
- {
- return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
- {
- return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c);
-@@ -7500,74 +8690,86 @@ vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
- {
- return (poly64x1_t)__builtin_neon_vextdi (__a, __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
- {
- return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
- {
- return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
- {
- return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-@@ -7575,290 +8777,338 @@ vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
- {
- return (poly64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
- {
- return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
- {
- return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
- {
- return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_s8 (int8x8_t __a)
- {
- return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_s16 (int16x4_t __a)
- {
- return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_s32 (int32x2_t __a)
- {
- return (int32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_f32 (float32x2_t __a)
- {
- return (float32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_u8 (uint8x8_t __a)
- {
- return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_u16 (uint16x4_t __a)
- {
- return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_u32 (uint32x2_t __a)
- {
- return (uint32x2_t) __builtin_shuffle (__a, (uint32x2_t) { 1, 0 });
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_p8 (poly8x8_t __a)
- {
- return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64_p16 (poly16x4_t __a)
- {
- return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_s8 (int8x16_t __a)
- {
- return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_s16 (int16x8_t __a)
- {
- return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_s32 (int32x4_t __a)
- {
- return (int32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_f32 (float32x4_t __a)
- {
- return (float32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_u8 (uint8x16_t __a)
- {
- return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_u16 (uint16x8_t __a)
- {
- return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_u32 (uint32x4_t __a)
- {
- return (uint32x4_t) __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_p8 (poly8x16_t __a)
- {
- return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev64q_p16 (poly16x8_t __a)
- {
- return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32_s8 (int8x8_t __a)
- {
- return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32_s16 (int16x4_t __a)
- {
- return (int16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32_u8 (uint8x8_t __a)
- {
- return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32_u16 (uint16x4_t __a)
- {
- return (uint16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32_p8 (poly8x8_t __a)
- {
- return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32_p16 (poly16x4_t __a)
- {
- return (poly16x4_t) __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 });
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32q_s8 (int8x16_t __a)
- {
- return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32q_s16 (int16x8_t __a)
- {
- return (int16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32q_u8 (uint8x16_t __a)
- {
- return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32q_u16 (uint16x8_t __a)
- {
- return (uint16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32q_p8 (poly8x16_t __a)
- {
- return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev32q_p16 (poly16x8_t __a)
- {
- return (poly16x8_t) __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev16_s8 (int8x8_t __a)
- {
- return (int8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev16_u8 (uint8x8_t __a)
- {
- return (uint8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev16_p8 (poly8x8_t __a)
- {
- return (poly8x8_t) __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev16q_s8 (int8x16_t __a)
- {
- return (int8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev16q_u8 (uint8x16_t __a)
- {
- return (uint8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vrev16q_p8 (poly8x16_t __a)
- {
- return (poly8x16_t) __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
-@@ -7866,74 +9116,86 @@ vrev16q_p8 (poly8x16_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_p64 (uint64x1_t __a, poly64x1_t __b, poly64x1_t __c)
- {
- return (poly64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
- {
- return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
- {
- return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
- {
- return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
- {
- return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
- {
- return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
- {
- return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
- {
- return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
- {
- return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
- {
- return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
- {
- return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
- {
- return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
-@@ -7941,74 +9203,86 @@ vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_p64 (uint64x2_t __a, poly64x2_t __b, poly64x2_t __c)
- {
- return (poly64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
- {
- return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
- {
- return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
- {
- return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
- {
- return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c);
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
- {
- return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
- {
- return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
- {
- return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
- {
- return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
- {
- return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
- {
- return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
- {
- return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
-@@ -8025,7 +9299,8 @@ vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
- vector, and will itself be loaded in reverse order (again, relative to the
- neon intrinsics view, i.e. that would result from a "vld1" instruction). */
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_s8 (int8x8_t __a, int8x8_t __b)
- {
- int8x8x2_t __rv;
-@@ -8043,7 +9318,8 @@ vtrn_s8 (int8x8_t __a, int8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_s16 (int16x4_t __a, int16x4_t __b)
- {
- int16x4x2_t __rv;
-@@ -8057,7 +9333,8 @@ vtrn_s16 (int16x4_t __a, int16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- uint8x8x2_t __rv;
-@@ -8075,7 +9352,8 @@ vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- uint16x4x2_t __rv;
-@@ -8089,7 +9367,8 @@ vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- poly8x8x2_t __rv;
-@@ -8107,7 +9386,8 @@ vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
- {
- poly16x4x2_t __rv;
-@@ -8121,7 +9401,8 @@ vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_s32 (int32x2_t __a, int32x2_t __b)
- {
- int32x2x2_t __rv;
-@@ -8135,7 +9416,8 @@ vtrn_s32 (int32x2_t __a, int32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_f32 (float32x2_t __a, float32x2_t __b)
- {
- float32x2x2_t __rv;
-@@ -8149,7 +9431,8 @@ vtrn_f32 (float32x2_t __a, float32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- uint32x2x2_t __rv;
-@@ -8163,7 +9446,8 @@ vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_s8 (int8x16_t __a, int8x16_t __b)
- {
- int8x16x2_t __rv;
-@@ -8181,7 +9465,8 @@ vtrnq_s8 (int8x16_t __a, int8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_s16 (int16x8_t __a, int16x8_t __b)
- {
- int16x8x2_t __rv;
-@@ -8199,7 +9484,8 @@ vtrnq_s16 (int16x8_t __a, int16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_s32 (int32x4_t __a, int32x4_t __b)
- {
- int32x4x2_t __rv;
-@@ -8213,7 +9499,8 @@ vtrnq_s32 (int32x4_t __a, int32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_f32 (float32x4_t __a, float32x4_t __b)
- {
- float32x4x2_t __rv;
-@@ -8227,7 +9514,8 @@ vtrnq_f32 (float32x4_t __a, float32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- uint8x16x2_t __rv;
-@@ -8245,7 +9533,8 @@ vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- uint16x8x2_t __rv;
-@@ -8263,7 +9552,8 @@ vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- uint32x4x2_t __rv;
-@@ -8277,7 +9567,8 @@ vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- poly8x16x2_t __rv;
-@@ -8295,7 +9586,8 @@ vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
- {
- poly16x8x2_t __rv;
-@@ -8313,7 +9605,8 @@ vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_s8 (int8x8_t __a, int8x8_t __b)
- {
- int8x8x2_t __rv;
-@@ -8331,7 +9624,8 @@ vzip_s8 (int8x8_t __a, int8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_s16 (int16x4_t __a, int16x4_t __b)
- {
- int16x4x2_t __rv;
-@@ -8345,7 +9639,8 @@ vzip_s16 (int16x4_t __a, int16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- uint8x8x2_t __rv;
-@@ -8363,7 +9658,8 @@ vzip_u8 (uint8x8_t __a, uint8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- uint16x4x2_t __rv;
-@@ -8377,7 +9673,8 @@ vzip_u16 (uint16x4_t __a, uint16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- poly8x8x2_t __rv;
-@@ -8395,7 +9692,8 @@ vzip_p8 (poly8x8_t __a, poly8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_p16 (poly16x4_t __a, poly16x4_t __b)
- {
- poly16x4x2_t __rv;
-@@ -8409,7 +9707,8 @@ vzip_p16 (poly16x4_t __a, poly16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_s32 (int32x2_t __a, int32x2_t __b)
- {
- int32x2x2_t __rv;
-@@ -8423,7 +9722,8 @@ vzip_s32 (int32x2_t __a, int32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_f32 (float32x2_t __a, float32x2_t __b)
- {
- float32x2x2_t __rv;
-@@ -8437,7 +9737,8 @@ vzip_f32 (float32x2_t __a, float32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzip_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- uint32x2x2_t __rv;
-@@ -8451,7 +9752,8 @@ vzip_u32 (uint32x2_t __a, uint32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_s8 (int8x16_t __a, int8x16_t __b)
- {
- int8x16x2_t __rv;
-@@ -8469,7 +9771,8 @@ vzipq_s8 (int8x16_t __a, int8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_s16 (int16x8_t __a, int16x8_t __b)
- {
- int16x8x2_t __rv;
-@@ -8487,7 +9790,8 @@ vzipq_s16 (int16x8_t __a, int16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_s32 (int32x4_t __a, int32x4_t __b)
- {
- int32x4x2_t __rv;
-@@ -8501,7 +9805,8 @@ vzipq_s32 (int32x4_t __a, int32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_f32 (float32x4_t __a, float32x4_t __b)
- {
- float32x4x2_t __rv;
-@@ -8515,7 +9820,8 @@ vzipq_f32 (float32x4_t __a, float32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- uint8x16x2_t __rv;
-@@ -8533,7 +9839,8 @@ vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- uint16x8x2_t __rv;
-@@ -8551,7 +9858,8 @@ vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- uint32x4x2_t __rv;
-@@ -8565,7 +9873,8 @@ vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- poly8x16x2_t __rv;
-@@ -8583,7 +9892,8 @@ vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
- {
- poly16x8x2_t __rv;
-@@ -8601,7 +9911,8 @@ vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_s8 (int8x8_t __a, int8x8_t __b)
- {
- int8x8x2_t __rv;
-@@ -8619,7 +9930,8 @@ vuzp_s8 (int8x8_t __a, int8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_s16 (int16x4_t __a, int16x4_t __b)
- {
- int16x4x2_t __rv;
-@@ -8633,7 +9945,8 @@ vuzp_s16 (int16x4_t __a, int16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_s32 (int32x2_t __a, int32x2_t __b)
- {
- int32x2x2_t __rv;
-@@ -8647,7 +9960,8 @@ vuzp_s32 (int32x2_t __a, int32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_f32 (float32x2_t __a, float32x2_t __b)
- {
- float32x2x2_t __rv;
-@@ -8661,7 +9975,8 @@ vuzp_f32 (float32x2_t __a, float32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- uint8x8x2_t __rv;
-@@ -8679,7 +9994,8 @@ vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- uint16x4x2_t __rv;
-@@ -8693,7 +10009,8 @@ vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- uint32x2x2_t __rv;
-@@ -8707,7 +10024,8 @@ vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
- {
- poly8x8x2_t __rv;
-@@ -8725,7 +10043,8 @@ vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
- {
- poly16x4x2_t __rv;
-@@ -8739,7 +10058,8 @@ vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_s8 (int8x16_t __a, int8x16_t __b)
- {
- int8x16x2_t __rv;
-@@ -8757,7 +10077,8 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_s16 (int16x8_t __a, int16x8_t __b)
- {
- int16x8x2_t __rv;
-@@ -8775,7 +10096,8 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_s32 (int32x4_t __a, int32x4_t __b)
- {
- int32x4x2_t __rv;
-@@ -8789,7 +10111,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_f32 (float32x4_t __a, float32x4_t __b)
- {
- float32x4x2_t __rv;
-@@ -8803,7 +10126,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- uint8x16x2_t __rv;
-@@ -8821,7 +10145,8 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- uint16x8x2_t __rv;
-@@ -8839,7 +10164,8 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
- return __rv;
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- uint32x4x2_t __rv;
-@@ -8853,7 +10179,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
- {
- poly8x16x2_t __rv;
-@@ -8871,7 +10198,8 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
- return __rv;
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
- {
- poly16x8x2_t __rv;
-@@ -8891,82 +10219,95 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_p64 (const poly64_t * __a)
- {
- return (poly64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_s8 (const int8_t * __a)
- {
- return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_s16 (const int16_t * __a)
- {
- return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_s32 (const int32_t * __a)
- {
- return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_s64 (const int64_t * __a)
- {
- return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_f16 (const float16_t * __a)
- {
- return __builtin_neon_vld1v4hf (__a);
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_f32 (const float32_t * __a)
- {
- return (float32x2_t)__builtin_neon_vld1v2sf ((const __builtin_neon_sf *) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_u8 (const uint8_t * __a)
- {
- return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_u16 (const uint16_t * __a)
- {
- return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_u32 (const uint32_t * __a)
- {
- return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_u64 (const uint64_t * __a)
- {
- return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_p8 (const poly8_t * __a)
- {
- return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_p16 (const poly16_t * __a)
- {
- return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
-@@ -8974,144 +10315,167 @@ vld1_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_p64 (const poly64_t * __a)
- {
- return (poly64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_s8 (const int8_t * __a)
- {
- return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_s16 (const int16_t * __a)
- {
- return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_s32 (const int32_t * __a)
- {
- return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_s64 (const int64_t * __a)
- {
- return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_f16 (const float16_t * __a)
- {
- return __builtin_neon_vld1v8hf (__a);
- }
- #endif
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_f32 (const float32_t * __a)
- {
- return (float32x4_t)__builtin_neon_vld1v4sf ((const __builtin_neon_sf *) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_u8 (const uint8_t * __a)
- {
- return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_u16 (const uint16_t * __a)
- {
- return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_u32 (const uint32_t * __a)
- {
- return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_u64 (const uint64_t * __a)
- {
- return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_p8 (const poly8_t * __a)
- {
- return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_p16 (const poly16_t * __a)
- {
- return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c)
- {
- return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c)
- {
- return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
- {
- return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_f16 (const float16_t * __a, float16x4_t __b, const int __c)
- {
- return vset_lane_f16 (*__a, __b, __c);
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
- {
- return (float32x2_t)__builtin_neon_vld1_lanev2sf ((const __builtin_neon_sf *) __a, __b, __c);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c)
- {
- return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c)
- {
- return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c)
- {
- return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c)
- {
- return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
- {
- return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
-@@ -9119,82 +10483,95 @@ vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_p64 (const poly64_t * __a, poly64x1_t __b, const int __c)
- {
- return (poly64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c)
- {
- return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c)
- {
- return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c)
- {
- return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c)
- {
- return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
- {
- return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c)
- {
- return vsetq_lane_f16 (*__a, __b, __c);
- }
- #endif
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
- {
- return (float32x4_t)__builtin_neon_vld1_lanev4sf ((const __builtin_neon_sf *) __a, __b, __c);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c)
- {
- return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c)
- {
- return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c)
- {
- return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c)
- {
- return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
- {
- return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
-@@ -9202,45 +10579,52 @@ vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_p64 (const poly64_t * __a, poly64x2_t __b, const int __c)
- {
- return (poly64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c)
- {
- return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c)
- {
- return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_s8 (const int8_t * __a)
- {
- return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_s16 (const int16_t * __a)
- {
- return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_s32 (const int32_t * __a)
- {
- return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_f16 (const float16_t * __a)
- {
- float16_t __f = *__a;
-@@ -9248,37 +10632,43 @@ vld1_dup_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_f32 (const float32_t * __a)
- {
- return (float32x2_t)__builtin_neon_vld1_dupv2sf ((const __builtin_neon_sf *) __a);
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_u8 (const uint8_t * __a)
- {
- return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_u16 (const uint16_t * __a)
- {
- return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_u32 (const uint32_t * __a)
- {
- return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_p8 (const poly8_t * __a)
- {
- return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_p16 (const poly16_t * __a)
- {
- return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
-@@ -9286,45 +10676,52 @@ vld1_dup_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_p64 (const poly64_t * __a)
- {
- return (poly64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_s64 (const int64_t * __a)
- {
- return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1_dup_u64 (const uint64_t * __a)
- {
- return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_s8 (const int8_t * __a)
- {
- return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_s16 (const int16_t * __a)
- {
- return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_s32 (const int32_t * __a)
- {
- return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_f16 (const float16_t * __a)
- {
- float16_t __f = *__a;
-@@ -9332,37 +10729,43 @@ vld1q_dup_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_f32 (const float32_t * __a)
- {
- return (float32x4_t)__builtin_neon_vld1_dupv4sf ((const __builtin_neon_sf *) __a);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_u8 (const uint8_t * __a)
- {
- return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_u16 (const uint16_t * __a)
- {
- return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_u32 (const uint32_t * __a)
- {
- return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_p8 (const poly8_t * __a)
- {
- return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_p16 (const poly16_t * __a)
- {
- return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
-@@ -9370,20 +10773,23 @@ vld1q_dup_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_p64 (const poly64_t * __a)
- {
- return (poly64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_s64 (const int64_t * __a)
- {
- return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld1q_dup_u64 (const uint64_t * __a)
- {
- return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
-@@ -9391,82 +10797,95 @@ vld1q_dup_u64 (const uint64_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_p64 (poly64_t * __a, poly64x1_t __b)
- {
- __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_s8 (int8_t * __a, int8x8_t __b)
- {
- __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_s16 (int16_t * __a, int16x4_t __b)
- {
- __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_s32 (int32_t * __a, int32x2_t __b)
- {
- __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_s64 (int64_t * __a, int64x1_t __b)
- {
- __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_f16 (float16_t * __a, float16x4_t __b)
- {
- __builtin_neon_vst1v4hf (__a, __b);
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_f32 (float32_t * __a, float32x2_t __b)
- {
- __builtin_neon_vst1v2sf ((__builtin_neon_sf *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_u8 (uint8_t * __a, uint8x8_t __b)
- {
- __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_u16 (uint16_t * __a, uint16x4_t __b)
- {
- __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_u32 (uint32_t * __a, uint32x2_t __b)
- {
- __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_u64 (uint64_t * __a, uint64x1_t __b)
- {
- __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_p8 (poly8_t * __a, poly8x8_t __b)
- {
- __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_p16 (poly16_t * __a, poly16x4_t __b)
- {
- __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
-@@ -9474,144 +10893,167 @@ vst1_p16 (poly16_t * __a, poly16x4_t __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_p64 (poly64_t * __a, poly64x2_t __b)
- {
- __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_s8 (int8_t * __a, int8x16_t __b)
- {
- __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_s16 (int16_t * __a, int16x8_t __b)
- {
- __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_s32 (int32_t * __a, int32x4_t __b)
- {
- __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_s64 (int64_t * __a, int64x2_t __b)
- {
- __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_f16 (float16_t * __a, float16x8_t __b)
- {
- __builtin_neon_vst1v8hf (__a, __b);
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_f32 (float32_t * __a, float32x4_t __b)
- {
- __builtin_neon_vst1v4sf ((__builtin_neon_sf *) __a, __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_u8 (uint8_t * __a, uint8x16_t __b)
- {
- __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_u16 (uint16_t * __a, uint16x8_t __b)
- {
- __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_u32 (uint32_t * __a, uint32x4_t __b)
- {
- __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_u64 (uint64_t * __a, uint64x2_t __b)
- {
- __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_p8 (poly8_t * __a, poly8x16_t __b)
- {
- __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_p16 (poly16_t * __a, poly16x8_t __b)
- {
- __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_f16 (float16_t * __a, float16x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4hf (__a, __b, __c);
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev2sf ((__builtin_neon_sf *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
-@@ -9619,82 +11061,95 @@ vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_p64 (poly64_t * __a, poly64x1_t __b, const int __c)
- {
- __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c)
- {
- __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c)
- {
- __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c);
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_f16 (float16_t * __a, float16x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8hf (__a, __b, __c);
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4sf ((__builtin_neon_sf *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
-@@ -9702,26 +11157,30 @@ vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_p64 (poly64_t * __a, poly64x2_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c)
- {
- __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_s8 (const int8_t * __a)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9729,7 +11188,8 @@ vld2_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_s16 (const int16_t * __a)
- {
- union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9737,7 +11197,8 @@ vld2_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_s32 (const int32_t * __a)
- {
- union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9746,7 +11207,8 @@ vld2_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_f16 (const float16_t * __a)
- {
- union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9755,7 +11217,8 @@ vld2_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_f32 (const float32_t * __a)
- {
- union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9763,7 +11226,8 @@ vld2_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_u8 (const uint8_t * __a)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9771,7 +11235,8 @@ vld2_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_u16 (const uint16_t * __a)
- {
- union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9779,7 +11244,8 @@ vld2_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_u32 (const uint32_t * __a)
- {
- union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9787,7 +11253,8 @@ vld2_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_p8 (const poly8_t * __a)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9795,7 +11262,8 @@ vld2_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_p16 (const poly16_t * __a)
- {
- union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9805,7 +11273,8 @@ vld2_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_p64 (const poly64_t * __a)
- {
- union { poly64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9814,7 +11283,8 @@ vld2_p64 (const poly64_t * __a)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_s64 (const int64_t * __a)
- {
- union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9822,7 +11292,8 @@ vld2_s64 (const int64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_u64 (const uint64_t * __a)
- {
- union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -9830,7 +11301,8 @@ vld2_u64 (const uint64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_s8 (const int8_t * __a)
- {
- union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9838,7 +11310,8 @@ vld2q_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_s16 (const int16_t * __a)
- {
- union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9846,7 +11319,8 @@ vld2q_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_s32 (const int32_t * __a)
- {
- union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9855,7 +11329,8 @@ vld2q_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_f16 (const float16_t * __a)
- {
- union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9864,7 +11339,8 @@ vld2q_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_f32 (const float32_t * __a)
- {
- union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9872,7 +11348,8 @@ vld2q_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_u8 (const uint8_t * __a)
- {
- union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9880,7 +11357,8 @@ vld2q_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_u16 (const uint16_t * __a)
- {
- union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9888,7 +11366,8 @@ vld2q_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_u32 (const uint32_t * __a)
- {
- union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9896,7 +11375,8 @@ vld2q_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_p8 (const poly8_t * __a)
- {
- union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9904,7 +11384,8 @@ vld2q_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_p16 (const poly16_t * __a)
- {
- union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-@@ -9912,7 +11393,8 @@ vld2q_p16 (const poly16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9921,7 +11403,8 @@ vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
- {
- union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9930,7 +11413,8 @@ vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
- {
- union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9940,7 +11424,8 @@ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
- {
- union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9950,7 +11435,8 @@ vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
- {
- union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9959,7 +11445,8 @@ vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9968,7 +11455,8 @@ vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
- {
- union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9977,7 +11465,8 @@ vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
- {
- union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9986,7 +11475,8 @@ vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -9995,7 +11485,8 @@ vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
- {
- union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10004,7 +11495,8 @@ vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
- {
- union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10013,7 +11505,8 @@ vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
- {
- union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10023,7 +11516,8 @@ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
- {
- union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10033,7 +11527,8 @@ vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
- {
- union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10042,7 +11537,8 @@ vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
- {
- union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10051,7 +11547,8 @@ vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
- {
- union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10060,7 +11557,8 @@ vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
- {
- union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10069,7 +11567,8 @@ vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_s8 (const int8_t * __a)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10077,7 +11576,8 @@ vld2_dup_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_s16 (const int16_t * __a)
- {
- union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10085,7 +11585,8 @@ vld2_dup_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_s32 (const int32_t * __a)
- {
- union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10094,7 +11595,8 @@ vld2_dup_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_f16 (const float16_t * __a)
- {
- union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10103,7 +11605,8 @@ vld2_dup_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_f32 (const float32_t * __a)
- {
- union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10111,7 +11614,8 @@ vld2_dup_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_u8 (const uint8_t * __a)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10119,7 +11623,8 @@ vld2_dup_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_u16 (const uint16_t * __a)
- {
- union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10127,7 +11632,8 @@ vld2_dup_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_u32 (const uint32_t * __a)
- {
- union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10135,7 +11641,8 @@ vld2_dup_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_p8 (const poly8_t * __a)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10143,7 +11650,8 @@ vld2_dup_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_p16 (const poly16_t * __a)
- {
- union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10153,7 +11661,8 @@ vld2_dup_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_p64 (const poly64_t * __a)
- {
- union { poly64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10162,7 +11671,8 @@ vld2_dup_p64 (const poly64_t * __a)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_s64 (const int64_t * __a)
- {
- union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10170,7 +11680,8 @@ vld2_dup_s64 (const int64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld2_dup_u64 (const uint64_t * __a)
- {
- union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-@@ -10178,21 +11689,24 @@ vld2_dup_u64 (const uint64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_s8 (int8_t * __a, int8x8x2_t __b)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_s16 (int16_t * __a, int16x4x2_t __b)
- {
- union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_s32 (int32_t * __a, int32x2x2_t __b)
- {
- union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10200,7 +11714,8 @@ vst2_s32 (int32_t * __a, int32x2x2_t __b)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_f16 (float16_t * __a, float16x4x2_t __b)
- {
- union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10208,42 +11723,48 @@ vst2_f16 (float16_t * __a, float16x4x2_t __b)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_f32 (float32_t * __a, float32x2x2_t __b)
- {
- union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v2sf ((__builtin_neon_sf *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_u16 (uint16_t * __a, uint16x4x2_t __b)
- {
- union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_u32 (uint32_t * __a, uint32x2x2_t __b)
- {
- union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_p8 (poly8_t * __a, poly8x8x2_t __b)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
- {
- union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10252,7 +11773,8 @@ vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_p64 (poly64_t * __a, poly64x1x2_t __b)
- {
- union { poly64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10260,35 +11782,40 @@ vst2_p64 (poly64_t * __a, poly64x1x2_t __b)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_s64 (int64_t * __a, int64x1x2_t __b)
- {
- union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_u64 (uint64_t * __a, uint64x1x2_t __b)
- {
- union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_s8 (int8_t * __a, int8x16x2_t __b)
- {
- union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_s16 (int16_t * __a, int16x8x2_t __b)
- {
- union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_s32 (int32_t * __a, int32x4x2_t __b)
- {
- union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10296,7 +11823,8 @@ vst2q_s32 (int32_t * __a, int32x4x2_t __b)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_f16 (float16_t * __a, float16x8x2_t __b)
- {
- union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10304,63 +11832,72 @@ vst2q_f16 (float16_t * __a, float16x8x2_t __b)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_f32 (float32_t * __a, float32x4x2_t __b)
- {
- union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v4sf ((__builtin_neon_sf *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_u8 (uint8_t * __a, uint8x16x2_t __b)
- {
- union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_u16 (uint16_t * __a, uint16x8x2_t __b)
- {
- union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_u32 (uint32_t * __a, uint32x4x2_t __b)
- {
- union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_p8 (poly8_t * __a, poly8x16x2_t __b)
- {
- union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_p16 (poly16_t * __a, poly16x8x2_t __b)
- {
- union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c)
- {
- union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c)
- {
- union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
- {
- union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10368,7 +11905,8 @@ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
- {
- union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-@@ -10376,56 +11914,64 @@ vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
- {
- union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c)
- {
- union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c)
- {
- union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c)
- {
- union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c)
- {
- union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c)
- {
- union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
- __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c)
- {
- union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
- {
- union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10433,7 +11979,8 @@ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
- {
- union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -10441,35 +11988,40 @@ vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
- {
- union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c)
- {
- union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c)
- {
- union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c)
- {
- union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_s8 (const int8_t * __a)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10477,7 +12029,8 @@ vld3_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_s16 (const int16_t * __a)
- {
- union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10485,7 +12038,8 @@ vld3_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_s32 (const int32_t * __a)
- {
- union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10494,7 +12048,8 @@ vld3_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_f16 (const float16_t * __a)
- {
- union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10503,7 +12058,8 @@ vld3_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_f32 (const float32_t * __a)
- {
- union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10511,7 +12067,8 @@ vld3_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_u8 (const uint8_t * __a)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10519,7 +12076,8 @@ vld3_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_u16 (const uint16_t * __a)
- {
- union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10527,7 +12085,8 @@ vld3_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_u32 (const uint32_t * __a)
- {
- union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10535,7 +12094,8 @@ vld3_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_p8 (const poly8_t * __a)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10543,7 +12103,8 @@ vld3_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_p16 (const poly16_t * __a)
- {
- union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10553,7 +12114,8 @@ vld3_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_p64 (const poly64_t * __a)
- {
- union { poly64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10562,7 +12124,8 @@ vld3_p64 (const poly64_t * __a)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_s64 (const int64_t * __a)
- {
- union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10570,7 +12133,8 @@ vld3_s64 (const int64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_u64 (const uint64_t * __a)
- {
- union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10578,7 +12142,8 @@ vld3_u64 (const uint64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_s8 (const int8_t * __a)
- {
- union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10586,7 +12151,8 @@ vld3q_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_s16 (const int16_t * __a)
- {
- union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10594,7 +12160,8 @@ vld3q_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_s32 (const int32_t * __a)
- {
- union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10603,7 +12170,8 @@ vld3q_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_f16 (const float16_t * __a)
- {
- union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10612,7 +12180,8 @@ vld3q_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_f32 (const float32_t * __a)
- {
- union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10620,7 +12189,8 @@ vld3q_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_u8 (const uint8_t * __a)
- {
- union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10628,7 +12198,8 @@ vld3q_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_u16 (const uint16_t * __a)
- {
- union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10636,7 +12207,8 @@ vld3q_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_u32 (const uint32_t * __a)
- {
- union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10644,7 +12216,8 @@ vld3q_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_p8 (const poly8_t * __a)
- {
- union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10652,7 +12225,8 @@ vld3q_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_p16 (const poly16_t * __a)
- {
- union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-@@ -10660,7 +12234,8 @@ vld3q_p16 (const poly16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10669,7 +12244,8 @@ vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
- {
- union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10678,7 +12254,8 @@ vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
- {
- union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10688,7 +12265,8 @@ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
- {
- union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10698,7 +12276,8 @@ vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
- {
- union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10707,7 +12286,8 @@ vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10716,7 +12296,8 @@ vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
- {
- union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10725,7 +12306,8 @@ vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
- {
- union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10734,7 +12316,8 @@ vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10743,7 +12326,8 @@ vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
- {
- union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10752,7 +12336,8 @@ vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
- {
- union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10761,7 +12346,8 @@ vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
- {
- union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10771,7 +12357,8 @@ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
- {
- union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10781,7 +12368,8 @@ vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
- {
- union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10790,7 +12378,8 @@ vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
- {
- union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10799,7 +12388,8 @@ vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
- {
- union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10808,7 +12398,8 @@ vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
- {
- union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -10817,7 +12408,8 @@ vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_s8 (const int8_t * __a)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10825,7 +12417,8 @@ vld3_dup_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_s16 (const int16_t * __a)
- {
- union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10833,7 +12426,8 @@ vld3_dup_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_s32 (const int32_t * __a)
- {
- union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10842,7 +12436,8 @@ vld3_dup_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_f16 (const float16_t * __a)
- {
- union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10851,7 +12446,8 @@ vld3_dup_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_f32 (const float32_t * __a)
- {
- union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10859,7 +12455,8 @@ vld3_dup_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_u8 (const uint8_t * __a)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10867,7 +12464,8 @@ vld3_dup_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_u16 (const uint16_t * __a)
- {
- union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10875,7 +12473,8 @@ vld3_dup_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_u32 (const uint32_t * __a)
- {
- union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10883,7 +12482,8 @@ vld3_dup_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_p8 (const poly8_t * __a)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10891,7 +12491,8 @@ vld3_dup_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_p16 (const poly16_t * __a)
- {
- union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10901,7 +12502,8 @@ vld3_dup_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_p64 (const poly64_t * __a)
- {
- union { poly64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10910,7 +12512,8 @@ vld3_dup_p64 (const poly64_t * __a)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_s64 (const int64_t * __a)
- {
- union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10918,7 +12521,8 @@ vld3_dup_s64 (const int64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1x3_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld3_dup_u64 (const uint64_t * __a)
- {
- union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-@@ -10926,21 +12530,24 @@ vld3_dup_u64 (const uint64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_s8 (int8_t * __a, int8x8x3_t __b)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_s16 (int16_t * __a, int16x4x3_t __b)
- {
- union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_s32 (int32_t * __a, int32x2x3_t __b)
- {
- union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10948,7 +12555,8 @@ vst3_s32 (int32_t * __a, int32x2x3_t __b)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_f16 (float16_t * __a, float16x4x3_t __b)
- {
- union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -10956,42 +12564,48 @@ vst3_f16 (float16_t * __a, float16x4x3_t __b)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_f32 (float32_t * __a, float32x2x3_t __b)
- {
- union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v2sf ((__builtin_neon_sf *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_u8 (uint8_t * __a, uint8x8x3_t __b)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_u16 (uint16_t * __a, uint16x4x3_t __b)
- {
- union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_u32 (uint32_t * __a, uint32x2x3_t __b)
- {
- union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_p8 (poly8_t * __a, poly8x8x3_t __b)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
- {
- union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -11000,7 +12614,8 @@ vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_p64 (poly64_t * __a, poly64x1x3_t __b)
- {
- union { poly64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -11008,35 +12623,40 @@ vst3_p64 (poly64_t * __a, poly64x1x3_t __b)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_s64 (int64_t * __a, int64x1x3_t __b)
- {
- union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_u64 (uint64_t * __a, uint64x1x3_t __b)
- {
- union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_s8 (int8_t * __a, int8x16x3_t __b)
- {
- union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_s16 (int16_t * __a, int16x8x3_t __b)
- {
- union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_s32 (int32_t * __a, int32x4x3_t __b)
- {
- union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -11044,7 +12664,8 @@ vst3q_s32 (int32_t * __a, int32x4x3_t __b)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_f16 (float16_t * __a, float16x8x3_t __b)
- {
- union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -11052,63 +12673,72 @@ vst3q_f16 (float16_t * __a, float16x8x3_t __b)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_f32 (float32_t * __a, float32x4x3_t __b)
- {
- union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v4sf ((__builtin_neon_sf *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_u8 (uint8_t * __a, uint8x16x3_t __b)
- {
- union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_u16 (uint16_t * __a, uint16x8x3_t __b)
- {
- union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_u32 (uint32_t * __a, uint32x4x3_t __b)
- {
- union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_p8 (poly8_t * __a, poly8x16x3_t __b)
- {
- union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_p16 (poly16_t * __a, poly16x8x3_t __b)
- {
- union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c)
- {
- union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c)
- {
- union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
- {
- union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -11116,7 +12746,8 @@ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
- {
- union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-@@ -11124,56 +12755,64 @@ vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
- {
- union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c)
- {
- union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c)
- {
- union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c)
- {
- union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c)
- {
- union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c)
- {
- union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
- __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c)
- {
- union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
- {
- union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -11181,7 +12820,8 @@ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c)
- {
- union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-@@ -11189,35 +12829,40 @@ vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
- {
- union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c)
- {
- union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c)
- {
- union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c)
- {
- union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
- __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_s8 (const int8_t * __a)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11225,7 +12870,8 @@ vld4_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_s16 (const int16_t * __a)
- {
- union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11233,7 +12879,8 @@ vld4_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_s32 (const int32_t * __a)
- {
- union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11242,7 +12889,8 @@ vld4_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_f16 (const float16_t * __a)
- {
- union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11251,7 +12899,8 @@ vld4_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_f32 (const float32_t * __a)
- {
- union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11259,7 +12908,8 @@ vld4_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_u8 (const uint8_t * __a)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11267,7 +12917,8 @@ vld4_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_u16 (const uint16_t * __a)
- {
- union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11275,7 +12926,8 @@ vld4_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_u32 (const uint32_t * __a)
- {
- union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11283,7 +12935,8 @@ vld4_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_p8 (const poly8_t * __a)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11291,7 +12944,8 @@ vld4_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_p16 (const poly16_t * __a)
- {
- union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11301,7 +12955,8 @@ vld4_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_p64 (const poly64_t * __a)
- {
- union { poly64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11310,7 +12965,8 @@ vld4_p64 (const poly64_t * __a)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_s64 (const int64_t * __a)
- {
- union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11318,7 +12974,8 @@ vld4_s64 (const int64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_u64 (const uint64_t * __a)
- {
- union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11326,7 +12983,8 @@ vld4_u64 (const uint64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_s8 (const int8_t * __a)
- {
- union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11334,7 +12992,8 @@ vld4q_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_s16 (const int16_t * __a)
- {
- union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11342,7 +13001,8 @@ vld4q_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_s32 (const int32_t * __a)
- {
- union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11351,7 +13011,8 @@ vld4q_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_f16 (const float16_t * __a)
- {
- union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11360,7 +13021,8 @@ vld4q_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_f32 (const float32_t * __a)
- {
- union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11368,7 +13030,8 @@ vld4q_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_u8 (const uint8_t * __a)
- {
- union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11376,7 +13039,8 @@ vld4q_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_u16 (const uint16_t * __a)
- {
- union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11384,7 +13048,8 @@ vld4q_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_u32 (const uint32_t * __a)
- {
- union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11392,7 +13057,8 @@ vld4q_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_p8 (const poly8_t * __a)
- {
- union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11400,7 +13066,8 @@ vld4q_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_p16 (const poly16_t * __a)
- {
- union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-@@ -11408,7 +13075,8 @@ vld4q_p16 (const poly16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11417,7 +13085,8 @@ vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
- {
- union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11426,7 +13095,8 @@ vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
- {
- union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11436,7 +13106,8 @@ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c)
- {
- union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11447,7 +13118,8 @@ vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
- {
- union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11456,7 +13128,8 @@ vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11465,7 +13138,8 @@ vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
- {
- union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11474,7 +13148,8 @@ vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
- {
- union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11483,7 +13158,8 @@ vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11492,7 +13168,8 @@ vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
- {
- union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11501,7 +13178,8 @@ vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
- {
- union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11510,7 +13188,8 @@ vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
- {
- union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11520,7 +13199,8 @@ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c)
- {
- union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11531,7 +13211,8 @@ vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
- {
- union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11540,7 +13221,8 @@ vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
- {
- union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11549,7 +13231,8 @@ vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
- {
- union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11558,7 +13241,8 @@ vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
- {
- union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11567,7 +13251,8 @@ vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
- return __rv.__i;
- }
-
--__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_s8 (const int8_t * __a)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11575,7 +13260,8 @@ vld4_dup_s8 (const int8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_s16 (const int16_t * __a)
- {
- union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11583,7 +13269,8 @@ vld4_dup_s16 (const int16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_s32 (const int32_t * __a)
- {
- union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11592,7 +13279,8 @@ vld4_dup_s32 (const int32_t * __a)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_f16 (const float16_t * __a)
- {
- union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11601,7 +13289,8 @@ vld4_dup_f16 (const float16_t * __a)
- }
- #endif
-
--__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_f32 (const float32_t * __a)
- {
- union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11609,7 +13298,8 @@ vld4_dup_f32 (const float32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_u8 (const uint8_t * __a)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11617,7 +13307,8 @@ vld4_dup_u8 (const uint8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_u16 (const uint16_t * __a)
- {
- union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11625,7 +13316,8 @@ vld4_dup_u16 (const uint16_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_u32 (const uint32_t * __a)
- {
- union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11633,7 +13325,8 @@ vld4_dup_u32 (const uint32_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_p8 (const poly8_t * __a)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11641,7 +13334,8 @@ vld4_dup_p8 (const poly8_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_p16 (const poly16_t * __a)
- {
- union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11651,7 +13345,8 @@ vld4_dup_p16 (const poly16_t * __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_p64 (const poly64_t * __a)
- {
- union { poly64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11660,7 +13355,8 @@ vld4_dup_p64 (const poly64_t * __a)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_s64 (const int64_t * __a)
- {
- union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11668,7 +13364,8 @@ vld4_dup_s64 (const int64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vld4_dup_u64 (const uint64_t * __a)
- {
- union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-@@ -11676,21 +13373,24 @@ vld4_dup_u64 (const uint64_t * __a)
- return __rv.__i;
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_s8 (int8_t * __a, int8x8x4_t __b)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_s16 (int16_t * __a, int16x4x4_t __b)
- {
- union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_s32 (int32_t * __a, int32x2x4_t __b)
- {
- union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11698,7 +13398,8 @@ vst4_s32 (int32_t * __a, int32x2x4_t __b)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_f16 (float16_t * __a, float16x4x4_t __b)
- {
- union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11706,42 +13407,48 @@ vst4_f16 (float16_t * __a, float16x4x4_t __b)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_f32 (float32_t * __a, float32x2x4_t __b)
- {
- union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v2sf ((__builtin_neon_sf *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_u16 (uint16_t * __a, uint16x4x4_t __b)
- {
- union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_u32 (uint32_t * __a, uint32x2x4_t __b)
- {
- union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_p8 (poly8_t * __a, poly8x8x4_t __b)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
- {
- union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11750,7 +13457,8 @@ vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_p64 (poly64_t * __a, poly64x1x4_t __b)
- {
- union { poly64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11758,35 +13466,40 @@ vst4_p64 (poly64_t * __a, poly64x1x4_t __b)
- }
-
- #pragma GCC pop_options
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_s64 (int64_t * __a, int64x1x4_t __b)
- {
- union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_u64 (uint64_t * __a, uint64x1x4_t __b)
- {
- union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_s8 (int8_t * __a, int8x16x4_t __b)
- {
- union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_s16 (int16_t * __a, int16x8x4_t __b)
- {
- union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_s32 (int32_t * __a, int32x4x4_t __b)
- {
- union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11794,7 +13507,8 @@ vst4q_s32 (int32_t * __a, int32x4x4_t __b)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_f16 (float16_t * __a, float16x8x4_t __b)
- {
- union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11802,63 +13516,72 @@ vst4q_f16 (float16_t * __a, float16x8x4_t __b)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_f32 (float32_t * __a, float32x4x4_t __b)
- {
- union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v4sf ((__builtin_neon_sf *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_u8 (uint8_t * __a, uint8x16x4_t __b)
- {
- union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_u16 (uint16_t * __a, uint16x8x4_t __b)
- {
- union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_u32 (uint32_t * __a, uint32x4x4_t __b)
- {
- union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_p8 (poly8_t * __a, poly8x16x4_t __b)
- {
- union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_p16 (poly16_t * __a, poly16x8x4_t __b)
- {
- union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c)
- {
- union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c)
- {
- union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
- {
- union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11866,7 +13589,8 @@ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c)
- {
- union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-@@ -11874,56 +13598,64 @@ vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
- {
- union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev2sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c)
- {
- union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c)
- {
- union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c)
- {
- union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c)
- {
- union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c)
- {
- union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c)
- {
- union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
- {
- union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11931,7 +13663,8 @@ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c)
- {
- union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-@@ -11939,529 +13672,616 @@ vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c)
- }
- #endif
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
- {
- union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev4sf ((__builtin_neon_sf *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c)
- {
- union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c)
- {
- union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c)
- {
- union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
- __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vand_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vandq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a & __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorr_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorrq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a | __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veor_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- veorq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a ^ __b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbic_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vbicq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a & ~__b;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s8 (int8x8_t __a, int8x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s16 (int16x4_t __a, int16x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s32 (int32x2_t __a, int32x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u8 (uint8x8_t __a, uint8x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u16 (uint16x4_t __a, uint16x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u32 (uint32x2_t __a, uint32x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_s64 (int64x1_t __a, int64x1_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vorn_u64 (uint64x1_t __a, uint64x1_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s8 (int8x16_t __a, int8x16_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s16 (int16x8_t __a, int16x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s32 (int32x4_t __a, int32x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_s64 (int64x2_t __a, int64x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u8 (uint8x16_t __a, uint8x16_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u16 (uint16x8_t __a, uint16x8_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u32 (uint32x4_t __a, uint32x4_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vornq_u64 (uint64x2_t __a, uint64x2_t __b)
- {
- return __a | ~__b;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_p16 (poly16x4_t __a)
- {
- return (poly8x8_t) __a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_f16 (float16x4_t __a)
- {
- return (poly8x8_t) __a;
- }
- #endif
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_f32 (float32x2_t __a)
- {
- return (poly8x8_t)__a;
-@@ -12469,76 +14289,88 @@ vreinterpret_p8_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_p64 (poly64x1_t __a)
- {
- return (poly8x8_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s64 (int64x1_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u64 (uint64x1_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s8 (int8x8_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s16 (int16x4_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_s32 (int32x2_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u8 (uint8x8_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u16 (uint16x4_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p8_u32 (uint32x2_t __a)
- {
- return (poly8x8_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_p8 (poly8x8_t __a)
- {
- return (poly16x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_f16 (float16x4_t __a)
- {
- return (poly16x4_t) __a;
- }
- #endif
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_f32 (float32x2_t __a)
- {
- return (poly16x4_t)__a;
-@@ -12546,63 +14378,73 @@ vreinterpret_p16_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_p64 (poly64x1_t __a)
- {
- return (poly16x4_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s64 (int64x1_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u64 (uint64x1_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s8 (int8x8_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s16 (int16x4_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_s32 (int32x2_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u8 (uint8x8_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u16 (uint16x4_t __a)
- {
- return (poly16x4_t)__a;
- }
-
--__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p16_u32 (uint32x2_t __a)
- {
- return (poly16x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_p8 (poly8x8_t __a)
- {
- return (float16x4_t) __a;
-@@ -12610,7 +14452,8 @@ vreinterpret_f16_p8 (poly8x8_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_p16 (poly16x4_t __a)
- {
- return (float16x4_t) __a;
-@@ -12618,7 +14461,8 @@ vreinterpret_f16_p16 (poly16x4_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_f32 (float32x2_t __a)
- {
- return (float16x4_t) __a;
-@@ -12628,7 +14472,8 @@ vreinterpret_f16_f32 (float32x2_t __a)
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_p64 (poly64x1_t __a)
- {
- return (float16x4_t) __a;
-@@ -12637,7 +14482,8 @@ vreinterpret_f16_p64 (poly64x1_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s64 (int64x1_t __a)
- {
- return (float16x4_t) __a;
-@@ -12645,7 +14491,8 @@ vreinterpret_f16_s64 (int64x1_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u64 (uint64x1_t __a)
- {
- return (float16x4_t) __a;
-@@ -12653,7 +14500,8 @@ vreinterpret_f16_u64 (uint64x1_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s8 (int8x8_t __a)
- {
- return (float16x4_t) __a;
-@@ -12661,7 +14509,8 @@ vreinterpret_f16_s8 (int8x8_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s16 (int16x4_t __a)
- {
- return (float16x4_t) __a;
-@@ -12669,7 +14518,8 @@ vreinterpret_f16_s16 (int16x4_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_s32 (int32x2_t __a)
- {
- return (float16x4_t) __a;
-@@ -12677,7 +14527,8 @@ vreinterpret_f16_s32 (int32x2_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u8 (uint8x8_t __a)
- {
- return (float16x4_t) __a;
-@@ -12685,7 +14536,8 @@ vreinterpret_f16_u8 (uint8x8_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u16 (uint16x4_t __a)
- {
- return (float16x4_t) __a;
-@@ -12693,27 +14545,31 @@ vreinterpret_f16_u16 (uint16x4_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f16_u32 (uint32x2_t __a)
- {
- return (float16x4_t) __a;
- }
- #endif
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_p8 (poly8x8_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_p16 (poly16x4_t __a)
- {
- return (float32x2_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_f16 (float16x4_t __a)
- {
- return (float32x2_t) __a;
-@@ -12722,56 +14578,65 @@ vreinterpret_f32_f16 (float16x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_p64 (poly64x1_t __a)
- {
- return (float32x2_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s64 (int64x1_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u64 (uint64x1_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s8 (int8x8_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s16 (int16x4_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_s32 (int32x2_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u8 (uint8x8_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u16 (uint16x4_t __a)
- {
- return (float32x2_t)__a;
- }
-
--__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_f32_u32 (uint32x2_t __a)
- {
- return (float32x2_t)__a;
-@@ -12779,102 +14644,118 @@ vreinterpret_f32_u32 (uint32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_p8 (poly8x8_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_p16 (poly16x4_t __a)
- {
- return (poly64x1_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_f16 (float16x4_t __a)
- {
- return (poly64x1_t) __a;
- }
- #endif
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_f32 (float32x2_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_s64 (int64x1_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_u64 (uint64x1_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_s8 (int8x8_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_s16 (int16x4_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_s32 (int32x2_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_u8 (uint8x8_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_u16 (uint16x4_t __a)
- {
- return (poly64x1_t)__a;
- }
-
--__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_p64_u32 (uint32x2_t __a)
- {
- return (poly64x1_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_p8 (poly8x8_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_p16 (poly16x4_t __a)
- {
- return (int64x1_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_f16 (float16x4_t __a)
- {
- return (int64x1_t) __a;
- }
- #endif
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_f32 (float32x2_t __a)
- {
- return (int64x1_t)__a;
-@@ -12882,76 +14763,88 @@ vreinterpret_s64_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_p64 (poly64x1_t __a)
- {
- return (int64x1_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u64 (uint64x1_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_s8 (int8x8_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_s16 (int16x4_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_s32 (int32x2_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u8 (uint8x8_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u16 (uint16x4_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s64_u32 (uint32x2_t __a)
- {
- return (int64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_p8 (poly8x8_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_p16 (poly16x4_t __a)
- {
- return (uint64x1_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_f16 (float16x4_t __a)
- {
- return (uint64x1_t) __a;
- }
- #endif
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_f32 (float32x2_t __a)
- {
- return (uint64x1_t)__a;
-@@ -12959,76 +14852,88 @@ vreinterpret_u64_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_p64 (poly64x1_t __a)
- {
- return (uint64x1_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s64 (int64x1_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s8 (int8x8_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s16 (int16x4_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_s32 (int32x2_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_u8 (uint8x8_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_u16 (uint16x4_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u64_u32 (uint32x2_t __a)
- {
- return (uint64x1_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_p8 (poly8x8_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_p16 (poly16x4_t __a)
- {
- return (int8x8_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_f16 (float16x4_t __a)
- {
- return (int8x8_t) __a;
- }
- #endif
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_f32 (float32x2_t __a)
- {
- return (int8x8_t)__a;
-@@ -13036,76 +14941,88 @@ vreinterpret_s8_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_p64 (poly64x1_t __a)
- {
- return (int8x8_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_s64 (int64x1_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u64 (uint64x1_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_s16 (int16x4_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_s32 (int32x2_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u8 (uint8x8_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u16 (uint16x4_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s8_u32 (uint32x2_t __a)
- {
- return (int8x8_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_p8 (poly8x8_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_p16 (poly16x4_t __a)
- {
- return (int16x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_f16 (float16x4_t __a)
- {
- return (int16x4_t) __a;
- }
- #endif
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_f32 (float32x2_t __a)
- {
- return (int16x4_t)__a;
-@@ -13113,76 +15030,88 @@ vreinterpret_s16_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_p64 (poly64x1_t __a)
- {
- return (int16x4_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_s64 (int64x1_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u64 (uint64x1_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_s8 (int8x8_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_s32 (int32x2_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u8 (uint8x8_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u16 (uint16x4_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s16_u32 (uint32x2_t __a)
- {
- return (int16x4_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_p8 (poly8x8_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_p16 (poly16x4_t __a)
- {
- return (int32x2_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_f16 (float16x4_t __a)
- {
- return (int32x2_t) __a;
- }
- #endif
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_f32 (float32x2_t __a)
- {
- return (int32x2_t)__a;
-@@ -13190,76 +15119,88 @@ vreinterpret_s32_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_p64 (poly64x1_t __a)
- {
- return (int32x2_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_s64 (int64x1_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u64 (uint64x1_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_s8 (int8x8_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_s16 (int16x4_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u8 (uint8x8_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u16 (uint16x4_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_s32_u32 (uint32x2_t __a)
- {
- return (int32x2_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_p8 (poly8x8_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_p16 (poly16x4_t __a)
- {
- return (uint8x8_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_f16 (float16x4_t __a)
- {
- return (uint8x8_t) __a;
- }
- #endif
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_f32 (float32x2_t __a)
- {
- return (uint8x8_t)__a;
-@@ -13267,76 +15208,88 @@ vreinterpret_u8_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_p64 (poly64x1_t __a)
- {
- return (uint8x8_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s64 (int64x1_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_u64 (uint64x1_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s8 (int8x8_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s16 (int16x4_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_s32 (int32x2_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_u16 (uint16x4_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u8_u32 (uint32x2_t __a)
- {
- return (uint8x8_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_p8 (poly8x8_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_p16 (poly16x4_t __a)
- {
- return (uint16x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_f16 (float16x4_t __a)
- {
- return (uint16x4_t) __a;
- }
- #endif
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_f32 (float32x2_t __a)
- {
- return (uint16x4_t)__a;
-@@ -13344,76 +15297,88 @@ vreinterpret_u16_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_p64 (poly64x1_t __a)
- {
- return (uint16x4_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s64 (int64x1_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_u64 (uint64x1_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s8 (int8x8_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s16 (int16x4_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_s32 (int32x2_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_u8 (uint8x8_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u16_u32 (uint32x2_t __a)
- {
- return (uint16x4_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_p8 (poly8x8_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_p16 (poly16x4_t __a)
- {
- return (uint32x2_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_f16 (float16x4_t __a)
- {
- return (uint32x2_t) __a;
- }
- #endif
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_f32 (float32x2_t __a)
- {
- return (uint32x2_t)__a;
-@@ -13421,70 +15386,81 @@ vreinterpret_u32_f32 (float32x2_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_p64 (poly64x1_t __a)
- {
- return (uint32x2_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s64 (int64x1_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_u64 (uint64x1_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s8 (int8x8_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s16 (int16x4_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_s32 (int32x2_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_u8 (uint8x8_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpret_u32_u16 (uint16x4_t __a)
- {
- return (uint32x2_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_p16 (poly16x8_t __a)
- {
- return (poly8x16_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_f16 (float16x8_t __a)
- {
- return (poly8x16_t) __a;
- }
- #endif
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_f32 (float32x4_t __a)
- {
- return (poly8x16_t)__a;
-@@ -13492,83 +15468,96 @@ vreinterpretq_p8_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_p64 (poly64x2_t __a)
- {
- return (poly8x16_t)__a;
- }
-
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_p128 (poly128_t __a)
- {
- return (poly8x16_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s64 (int64x2_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u64 (uint64x2_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s8 (int8x16_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s16 (int16x8_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_s32 (int32x4_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u8 (uint8x16_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u16 (uint16x8_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p8_u32 (uint32x4_t __a)
- {
- return (poly8x16_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_p8 (poly8x16_t __a)
- {
- return (poly16x8_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_f16 (float16x8_t __a)
- {
- return (poly16x8_t) __a;
- }
- #endif
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_f32 (float32x4_t __a)
- {
- return (poly16x8_t)__a;
-@@ -13576,69 +15565,80 @@ vreinterpretq_p16_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_p64 (poly64x2_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_p128 (poly128_t __a)
- {
- return (poly16x8_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s64 (int64x2_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u64 (uint64x2_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s8 (int8x16_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s16 (int16x8_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_s32 (int32x4_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u8 (uint8x16_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u16 (uint16x8_t __a)
- {
- return (poly16x8_t)__a;
- }
-
--__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p16_u32 (uint32x4_t __a)
- {
- return (poly16x8_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_p8 (poly8x16_t __a)
- {
- return (float16x8_t) __a;
-@@ -13646,7 +15646,8 @@ vreinterpretq_f16_p8 (poly8x16_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_p16 (poly16x8_t __a)
- {
- return (float16x8_t) __a;
-@@ -13654,7 +15655,8 @@ vreinterpretq_f16_p16 (poly16x8_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_f32 (float32x4_t __a)
- {
- return (float16x8_t) __a;
-@@ -13665,7 +15667,8 @@ vreinterpretq_f16_f32 (float32x4_t __a)
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_p64 (poly64x2_t __a)
- {
- return (float16x8_t) __a;
-@@ -13673,7 +15676,8 @@ vreinterpretq_f16_p64 (poly64x2_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_p128 (poly128_t __a)
- {
- return (float16x8_t) __a;
-@@ -13683,7 +15687,8 @@ vreinterpretq_f16_p128 (poly128_t __a)
- #pragma GCC pop_options
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s64 (int64x2_t __a)
- {
- return (float16x8_t) __a;
-@@ -13691,7 +15696,8 @@ vreinterpretq_f16_s64 (int64x2_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u64 (uint64x2_t __a)
- {
- return (float16x8_t) __a;
-@@ -13699,7 +15705,8 @@ vreinterpretq_f16_u64 (uint64x2_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s8 (int8x16_t __a)
- {
- return (float16x8_t) __a;
-@@ -13707,7 +15714,8 @@ vreinterpretq_f16_s8 (int8x16_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s16 (int16x8_t __a)
- {
- return (float16x8_t) __a;
-@@ -13715,7 +15723,8 @@ vreinterpretq_f16_s16 (int16x8_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_s32 (int32x4_t __a)
- {
- return (float16x8_t) __a;
-@@ -13723,7 +15732,8 @@ vreinterpretq_f16_s32 (int32x4_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u8 (uint8x16_t __a)
- {
- return (float16x8_t) __a;
-@@ -13731,7 +15741,8 @@ vreinterpretq_f16_u8 (uint8x16_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u16 (uint16x8_t __a)
- {
- return (float16x8_t) __a;
-@@ -13739,27 +15750,31 @@ vreinterpretq_f16_u16 (uint16x8_t __a)
- #endif
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f16_u32 (uint32x4_t __a)
- {
- return (float16x8_t) __a;
- }
- #endif
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_p8 (poly8x16_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_p16 (poly16x8_t __a)
- {
- return (float32x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_f16 (float16x8_t __a)
- {
- return (float32x4_t) __a;
-@@ -13768,62 +15783,72 @@ vreinterpretq_f32_f16 (float16x8_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_p64 (poly64x2_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_p128 (poly128_t __a)
- {
- return (float32x4_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s64 (int64x2_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u64 (uint64x2_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s8 (int8x16_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s16 (int16x8_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_s32 (int32x4_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u8 (uint8x16_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u16 (uint16x8_t __a)
- {
- return (float32x4_t)__a;
- }
-
--__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline float32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_f32_u32 (uint32x4_t __a)
- {
- return (float32x4_t)__a;
-@@ -13831,188 +15856,218 @@ vreinterpretq_f32_u32 (uint32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_p8 (poly8x16_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_p16 (poly16x8_t __a)
- {
- return (poly64x2_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_f16 (float16x8_t __a)
- {
- return (poly64x2_t) __a;
- }
- #endif
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_f32 (float32x4_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_p128 (poly128_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_s64 (int64x2_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_u64 (uint64x2_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_s8 (int8x16_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_s16 (int16x8_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_s32 (int32x4_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_u8 (uint8x16_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_u16 (uint16x8_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p64_u32 (uint32x4_t __a)
- {
- return (poly64x2_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_p8 (poly8x16_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_p16 (poly16x8_t __a)
- {
- return (poly128_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_f16 (float16x8_t __a)
- {
- return (poly128_t) __a;
- }
- #endif
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_f32 (float32x4_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_p64 (poly64x2_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_s64 (int64x2_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_u64 (uint64x2_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_s8 (int8x16_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_s16 (int16x8_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_s32 (int32x4_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_u8 (uint8x16_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_u16 (uint16x8_t __a)
- {
- return (poly128_t)__a;
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_p128_u32 (uint32x4_t __a)
- {
- return (poly128_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_p8 (poly8x16_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_p16 (poly16x8_t __a)
- {
- return (int64x2_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_f16 (float16x8_t __a)
- {
- return (int64x2_t) __a;
- }
- #endif
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_f32 (float32x4_t __a)
- {
- return (int64x2_t)__a;
-@@ -14020,82 +16075,95 @@ vreinterpretq_s64_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_p64 (poly64x2_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_p128 (poly128_t __a)
- {
- return (int64x2_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u64 (uint64x2_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_s8 (int8x16_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_s16 (int16x8_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_s32 (int32x4_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u8 (uint8x16_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u16 (uint16x8_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s64_u32 (uint32x4_t __a)
- {
- return (int64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_p8 (poly8x16_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_p16 (poly16x8_t __a)
- {
- return (uint64x2_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_f16 (float16x8_t __a)
- {
- return (uint64x2_t) __a;
- }
- #endif
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_f32 (float32x4_t __a)
- {
- return (uint64x2_t)__a;
-@@ -14103,82 +16171,95 @@ vreinterpretq_u64_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_p64 (poly64x2_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_p128 (poly128_t __a)
- {
- return (uint64x2_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s64 (int64x2_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s8 (int8x16_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s16 (int16x8_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_s32 (int32x4_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_u8 (uint8x16_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_u16 (uint16x8_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u64_u32 (uint32x4_t __a)
- {
- return (uint64x2_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_p8 (poly8x16_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_p16 (poly16x8_t __a)
- {
- return (int8x16_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_f16 (float16x8_t __a)
- {
- return (int8x16_t) __a;
- }
- #endif
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_f32 (float32x4_t __a)
- {
- return (int8x16_t)__a;
-@@ -14186,82 +16267,95 @@ vreinterpretq_s8_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_p64 (poly64x2_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_p128 (poly128_t __a)
- {
- return (int8x16_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_s64 (int64x2_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u64 (uint64x2_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_s16 (int16x8_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_s32 (int32x4_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u8 (uint8x16_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u16 (uint16x8_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s8_u32 (uint32x4_t __a)
- {
- return (int8x16_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_p8 (poly8x16_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_p16 (poly16x8_t __a)
- {
- return (int16x8_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_f16 (float16x8_t __a)
- {
- return (int16x8_t) __a;
- }
- #endif
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_f32 (float32x4_t __a)
- {
- return (int16x8_t)__a;
-@@ -14269,82 +16363,95 @@ vreinterpretq_s16_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_p64 (poly64x2_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_p128 (poly128_t __a)
- {
- return (int16x8_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_s64 (int64x2_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u64 (uint64x2_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_s8 (int8x16_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_s32 (int32x4_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u8 (uint8x16_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u16 (uint16x8_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s16_u32 (uint32x4_t __a)
- {
- return (int16x8_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_p8 (poly8x16_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_p16 (poly16x8_t __a)
- {
- return (int32x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_f16 (float16x8_t __a)
- {
- return (int32x4_t)__a;
- }
- #endif
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_f32 (float32x4_t __a)
- {
- return (int32x4_t)__a;
-@@ -14352,82 +16459,95 @@ vreinterpretq_s32_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_p64 (poly64x2_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_p128 (poly128_t __a)
- {
- return (int32x4_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_s64 (int64x2_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u64 (uint64x2_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_s8 (int8x16_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_s16 (int16x8_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u8 (uint8x16_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u16 (uint16x8_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline int32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_s32_u32 (uint32x4_t __a)
- {
- return (int32x4_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_p8 (poly8x16_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_p16 (poly16x8_t __a)
- {
- return (uint8x16_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_f16 (float16x8_t __a)
- {
- return (uint8x16_t) __a;
- }
- #endif
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_f32 (float32x4_t __a)
- {
- return (uint8x16_t)__a;
-@@ -14435,82 +16555,95 @@ vreinterpretq_u8_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_p64 (poly64x2_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_p128 (poly128_t __a)
- {
- return (uint8x16_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s64 (int64x2_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_u64 (uint64x2_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s8 (int8x16_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s16 (int16x8_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_s32 (int32x4_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_u16 (uint16x8_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u8_u32 (uint32x4_t __a)
- {
- return (uint8x16_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_p8 (poly8x16_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_p16 (poly16x8_t __a)
- {
- return (uint16x8_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_f16 (float16x8_t __a)
- {
- return (uint16x8_t) __a;
- }
- #endif
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_f32 (float32x4_t __a)
- {
- return (uint16x8_t)__a;
-@@ -14518,82 +16651,95 @@ vreinterpretq_u16_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_p64 (poly64x2_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_p128 (poly128_t __a)
- {
- return (uint16x8_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s64 (int64x2_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_u64 (uint64x2_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s8 (int8x16_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s16 (int16x8_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_s32 (int32x4_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_u8 (uint8x16_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u16_u32 (uint32x4_t __a)
- {
- return (uint16x8_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_p8 (poly8x16_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_p16 (poly16x8_t __a)
- {
- return (uint32x4_t)__a;
- }
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_f16 (float16x8_t __a)
- {
- return (uint32x4_t) __a;
- }
- #endif
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_f32 (float32x4_t __a)
- {
- return (uint32x4_t)__a;
-@@ -14601,56 +16747,65 @@ vreinterpretq_u32_f32 (float32x4_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_p64 (poly64x2_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_p128 (poly128_t __a)
- {
- return (uint32x4_t)__a;
- }
-
- #pragma GCC pop_options
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s64 (int64x2_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_u64 (uint64x2_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s8 (int8x16_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s16 (int16x8_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_s32 (int32x4_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_u8 (uint8x16_t __a)
- {
- return (uint32x4_t)__a;
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vreinterpretq_u32_u16 (uint16x8_t __a)
- {
- return (uint32x4_t)__a;
-@@ -14659,7 +16814,8 @@ vreinterpretq_u32_u16 (uint16x8_t __a)
-
- #pragma GCC push_options
- #pragma GCC target ("fpu=crypto-neon-fp-armv8")
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vldrq_p128 (poly128_t const * __ptr)
- {
- #ifdef __ARM_BIG_ENDIAN
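The push_options/pop_options brackets that recur throughout these hunks scope a target override to just the enclosed intrinsics. A minimal sketch of the mechanism, with a hypothetical function that is not from the patch:

#pragma GCC push_options
#pragma GCC target ("fpu=crypto-neon-fp-armv8")

/* Compiled as if -mfpu=crypto-neon-fp-armv8 were on the command line,
   so crypto/poly64 instructions may be used for this function alone.  */
unsigned int
crypto_only_helper (unsigned int __x)
{
  return __x;
}

#pragma GCC pop_options
/* The previous target options are restored for the rest of the file.  */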
-@@ -14672,7 +16828,8 @@ vldrq_p128 (poly128_t const * __ptr)
- #endif
- }
-
--__extension__ static __inline void __attribute__ ((__always_inline__))
-+__extension__ extern __inline void
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vstrq_p128 (poly128_t * __ptr, poly128_t __val)
- {
- #ifdef __ARM_BIG_ENDIAN
-@@ -14695,7 +16852,8 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
- If the result is all zeroes for any half then the whole result is zeroes.
- This is what the pairwise min reduction achieves. */
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vceq_p64 (poly64x1_t __a, poly64x1_t __b)
- {
- uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
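The comment above vceq_p64 describes the whole trick: compare the two poly64 values as pairs of 32-bit lanes, then fold with a pairwise minimum so that a single mismatching half zeroes the entire result. A scalar model of that reduction, for illustration only (ceq_p64_model is a made-up name, not part of the header):

#include <stdint.h>

/* Per-half all-ones/all-zeros compare masks, folded with min so that
   one zero (mismatching) half forces an all-zeros result.  */
uint64_t
ceq_p64_model (uint64_t __a, uint64_t __b)
{
  uint32_t __lo = ((uint32_t) __a == (uint32_t) __b) ? 0xffffffffu : 0u;
  uint32_t __hi = ((uint32_t) (__a >> 32) == (uint32_t) (__b >> 32))
                  ? 0xffffffffu : 0u;
  uint32_t __m = __lo < __hi ? __lo : __hi;  /* the pairwise min step */
  return ((uint64_t) __m << 32) | __m;
}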
-@@ -14710,7 +16868,8 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
- a reduction with max since if any two corresponding bits
- in the two poly64_t's match, then the whole result must be all ones. */
-
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint64x1_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vtst_p64 (poly64x1_t __a, poly64x1_t __b)
- {
- uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
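vtst_p64 flips the logic, as the comment above notes: per-lane test masks ((a & b) != 0) are folded with a pairwise maximum, so any half in which the operands share a set bit makes the whole result all ones. The same reduction in scalar form (tst_p64_model is a made-up name for illustration):

#include <stdint.h>

/* Per-half (a & b) != 0 masks, folded with max so that one matching
   half forces an all-ones result.  */
uint64_t
tst_p64_model (uint64_t __a, uint64_t __b)
{
  uint32_t __lo = (((uint32_t) __a & (uint32_t) __b) != 0u)
                  ? 0xffffffffu : 0u;
  uint32_t __hi = (((uint32_t) (__a >> 32) & (uint32_t) (__b >> 32)) != 0u)
                  ? 0xffffffffu : 0u;
  uint32_t __m = __lo > __hi ? __lo : __hi;  /* the pairwise max step */
  return ((uint64_t) __m << 32) | __m;
}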
-@@ -14720,31 +16879,36 @@ vtst_p64 (poly64x1_t __a, poly64x1_t __b)
- return vreinterpret_u64_u32 (__m);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
- {
- return __builtin_arm_crypto_aese (__data, __key);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaesdq_u8 (uint8x16_t __data, uint8x16_t __key)
- {
- return __builtin_arm_crypto_aesd (__data, __key);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaesmcq_u8 (uint8x16_t __data)
- {
- return __builtin_arm_crypto_aesmc (__data);
- }
-
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint8x16_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vaesimcq_u8 (uint8x16_t __data)
- {
- return __builtin_arm_crypto_aesimc (__data);
- }
-
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha1h_u32 (uint32_t __hash_e)
- {
- uint32x4_t __t = vdupq_n_u32 (0);
-@@ -14753,7 +16917,8 @@ vsha1h_u32 (uint32_t __hash_e)
- return vgetq_lane_u32 (__t, 0);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
- {
- uint32x4_t __t = vdupq_n_u32 (0);
-@@ -14761,7 +16926,8 @@ vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
- return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
- {
- uint32x4_t __t = vdupq_n_u32 (0);
-@@ -14769,7 +16935,8 @@ vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
- return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
- {
- uint32x4_t __t = vdupq_n_u32 (0);
-@@ -14777,49 +16944,57 @@ vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
- return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11)
- {
- return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15)
- {
- return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
- {
- return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
- {
- return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7)
- {
- return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7);
- }
-
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline uint32x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15)
- {
- return __builtin_arm_crypto_sha256su1 (__tw0_3, __w8_11, __w12_15);
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_p64 (poly64_t __a, poly64_t __b)
- {
- return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b);
- }
-
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
-+__extension__ extern __inline poly128_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
- vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
- {
- poly64_t __t1 = vget_high_p64 (__a);
-@@ -14830,6 +17005,984 @@ vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
-
- #pragma GCC pop_options
-
-+/* Intrinsics for FP16 instructions. */
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=neon-fp-armv8")
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabd_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vabdv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabdq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vabdv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabs_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vabsv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vabsq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vabsv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vadd_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vaddv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vaddq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vaddv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcage_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcagev4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcageq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcagev8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagt_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcagtv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcagtq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcagtv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcale_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcalev4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaleq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcalev8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcalt_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcaltv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcaltq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcaltv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceq_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vceqv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vceqv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqz_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vceqzv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vceqzq_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vceqzv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcge_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcgev4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgeq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcgev8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgez_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcgezv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgezq_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcgezv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgt_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcgtv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcgtv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtz_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcgtzv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcgtzq_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcgtzv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcle_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vclev4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcleq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vclev8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclez_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vclezv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclezq_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vclezv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vclt_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcltv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcltv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltz_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcltzv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcltzq_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcltzv8hf (__a);
-+}
-+
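The z-suffixed comparisons above are lane-wise compares against zero. A hedged equivalence sketch, using only intrinsics defined in this header (cltz_equiv is an illustrative name):

#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
/* Produces the same lane masks as vcltz_f16 (__a).  */
static inline uint16x4_t
cltz_equiv (float16x4_t __a)
{
  return vclt_f16 (__a, vdup_n_f16 (0.0));
}
#endif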
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f16_s16 (int16x4_t __a)
-+{
-+ return (float16x4_t)__builtin_neon_vcvtsv4hi (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_f16_u16 (uint16x4_t __a)
-+{
-+ return (float16x4_t)__builtin_neon_vcvtuv4hi ((int16x4_t)__a);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_s16_f16 (float16x4_t __a)
-+{
-+ return (int16x4_t)__builtin_neon_vcvtsv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_u16_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcvtuv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f16_s16 (int16x8_t __a)
-+{
-+ return (float16x8_t)__builtin_neon_vcvtsv8hi (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_f16_u16 (uint16x8_t __a)
-+{
-+ return (float16x8_t)__builtin_neon_vcvtuv8hi ((int16x8_t)__a);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_s16_f16 (float16x8_t __a)
-+{
-+ return (int16x8_t)__builtin_neon_vcvtsv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_u16_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcvtuv8hf (__a);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_s16_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vcvtasv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvta_u16_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcvtauv4hf (__a);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_s16_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vcvtasv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtaq_u16_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcvtauv8hf (__a);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_s16_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vcvtmsv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtm_u16_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcvtmuv4hf (__a);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_s16_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vcvtmsv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtmq_u16_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcvtmuv8hf (__a);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_s16_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vcvtnsv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtn_u16_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcvtnuv4hf (__a);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_s16_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vcvtnsv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtnq_u16_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcvtnuv8hf (__a);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_s16_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vcvtpsv4hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtp_u16_f16 (float16x4_t __a)
-+{
-+ return (uint16x4_t)__builtin_neon_vcvtpuv4hf (__a);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_s16_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vcvtpsv8hf (__a);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtpq_u16_f16 (float16x8_t __a)
-+{
-+ return (uint16x8_t)__builtin_neon_vcvtpuv8hf (__a);
-+}
-+
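The a/m/n/p suffixes in the four conversion families above select the rounding mode. A worked example for lanes holding 2.5, a value binary16 represents exactly:

/* vcvta_s16_f16  ->  3   (to nearest, ties away from zero)
   vcvtm_s16_f16  ->  2   (toward minus infinity)
   vcvtn_s16_f16  ->  2   (to nearest, ties to even)
   vcvtp_s16_f16  ->  3   (toward plus infinity)  */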
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f16_s16 (int16x4_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvts_nv4hi (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvtu_nv4hi ((int16x4_t)__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvts_nv8hi (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvtu_nv8hi ((int16x8_t)__a, __b);
-+}
-+
-+__extension__ extern __inline int16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_s16_f16 (float16x4_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvts_nv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvt_n_u16_f16 (float16x4_t __a, const int __b)
-+{
-+ return (uint16x4_t)__builtin_neon_vcvtu_nv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline int16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
-+{
-+ return __builtin_neon_vcvts_nv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline uint16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
-+{
-+ return (uint16x8_t)__builtin_neon_vcvtu_nv8hf (__a, __b);
-+}
-+
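In the _n variants the integer vector is treated as fixed point with __b fraction bits, where __b must be a compile-time constant in the range 1-16 for these 16-bit forms. A hedged sketch of a Q8.8 round trip in user code that includes <arm_neon.h> (the q8_8_* names are illustrative):

#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
/* Widen four Q8.8 fixed-point lanes to half precision; each lane is
   scaled by 2**-8 during the conversion.  */
static inline float16x4_t
q8_8_to_f16 (int16x4_t __q)
{
  return vcvt_n_f16_s16 (__q, 8);
}

/* And back: scale by 2**8 and convert, truncating toward zero.  */
static inline int16x4_t
f16_to_q8_8 (float16x4_t __f)
{
  return vcvt_n_s16_f16 (__f, 8);
}
#endif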
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
-+{
-+ return __builtin_neon_vfmav4hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
-+{
-+ return __builtin_neon_vfmav8hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
-+{
-+ return __builtin_neon_vfmsv4hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
-+{
-+ return __builtin_neon_vfmsv8hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmax_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vmaxfv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vmaxfv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vmaxnmv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vmaxnmv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmin_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vminfv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vminfv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnm_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vminnmv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vminnmq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vminnmv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vmulfv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __c)
-+{
-+ return __builtin_neon_vmul_lanev4hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmul_n_f16 (float16x4_t __a, float16_t __b)
-+{
-+ return __builtin_neon_vmul_nv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vmulfv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __c)
-+{
-+ return __builtin_neon_vmul_lanev8hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmulq_n_f16 (float16x8_t __a, float16_t __b)
-+{
-+ return __builtin_neon_vmul_nv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vneg_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vnegv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vnegq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vnegv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpadd_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vpaddv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmax_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vpmaxfv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vpmin_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vpminfv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpe_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrecpev4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpeq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrecpev8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnd_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrndv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrndv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrnda_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrndav4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndaq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrndav8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndm_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrndmv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndmq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrndmv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndn_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrndnv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndnq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrndnv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndp_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrndpv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndpq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrndpv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndx_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrndxv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrndxq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrndxv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrte_f16 (float16x4_t __a)
-+{
-+ return __builtin_neon_vrsqrtev4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrteq_f16 (float16x8_t __a)
-+{
-+ return __builtin_neon_vrsqrtev8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecps_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vrecpsv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vrecpsv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrts_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vrsqrtsv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vrsqrtsv8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsub_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ return __builtin_neon_vsubv4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vsubq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ return __builtin_neon_vsubv8hf (__a, __b);
-+}
-+
-+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. */
-+#pragma GCC pop_options
-+
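User code reaches the block above by including <arm_neon.h> with an FPU such as neon-fp-armv8 selected and by testing the same feature macro. A small sketch built from the intrinsics defined above (sad4_f16 is an illustrative name, and vget_lane_f16 is assumed to be provided by the same FP16 support):

#include <arm_neon.h>

#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
/* Sum of absolute differences over four half-precision lanes.  */
float16_t
sad4_f16 (float16x4_t __a, float16x4_t __b)
{
  float16x4_t __d = vabd_f16 (__a, __b);   /* lane-wise |a - b|       */
  __d = vpadd_f16 (__d, __d);              /* { d0+d1, d2+d3, ... }   */
  __d = vpadd_f16 (__d, __d);              /* fold to a single value  */
  return vget_lane_f16 (__d, 0);
}
#endif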
-+/* Half-precision data processing intrinsics. */
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
-+{
-+ return __builtin_neon_vbslv4hf ((int16x4_t)__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
-+{
-+ return __builtin_neon_vbslv8hf ((int16x8_t)__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_n_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vdup_nv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_n_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vdup_nv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdup_lane_f16 (float16x4_t __a, const int __b)
-+{
-+ return __builtin_neon_vdup_lanev4hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vdupq_lane_f16 (float16x4_t __a, const int __b)
-+{
-+ return __builtin_neon_vdup_lanev8hf (__a, __b);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vext_f16 (float16x4_t __a, float16x4_t __b, const int __c)
-+{
-+ return __builtin_neon_vextv4hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vextq_f16 (float16x8_t __a, float16x8_t __b, const int __c)
-+{
-+ return __builtin_neon_vextv8hf (__a, __b, __c);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmov_n_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vdup_nv4hf (__a);
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vmovq_n_f16 (float16_t __a)
-+{
-+ return __builtin_neon_vdup_nv8hf (__a);
-+}
-+
-+__extension__ extern __inline float16x4_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64_f16 (float16x4_t __a)
-+{
-+ return (float16x4_t)__builtin_shuffle (__a, (uint16x4_t){ 3, 2, 1, 0 });
-+}
-+
-+__extension__ extern __inline float16x8_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vrev64q_f16 (float16x8_t __a)
-+{
-+ return
-+ (float16x8_t)__builtin_shuffle (__a,
-+ (uint16x8_t){ 3, 2, 1, 0, 7, 6, 5, 4 });
-+}
-+
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrn_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ float16x4x2_t __rv;
-+#ifdef __ARM_BIG_ENDIAN
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 5, 1, 7, 3 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 4, 0, 6, 2 });
-+#else
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 0, 4, 2, 6 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 1, 5, 3, 7 });
-+#endif
-+ return __rv;
-+}
-+
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ float16x8x2_t __rv;
-+#ifdef __ARM_BIG_ENDIAN
-+ __rv.val[0] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 9, 1, 11, 3, 13, 5, 15, 7 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 8, 0, 10, 2, 12, 4, 14, 6 });
-+#else
-+ __rv.val[0] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 0, 8, 2, 10, 4, 12, 6, 14 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 1, 9, 3, 11, 5, 13, 7, 15 });
-+#endif
-+ return __rv;
-+}
-+
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzp_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ float16x4x2_t __rv;
-+#ifdef __ARM_BIG_ENDIAN
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 5, 7, 1, 3 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 4, 6, 0, 2 });
-+#else
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 0, 2, 4, 6 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 1, 3, 5, 7 });
-+#endif
-+ return __rv;
-+}
-+
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vuzpq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ float16x8x2_t __rv;
-+#ifdef __ARM_BIG_ENDIAN
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
-+ { 5, 7, 1, 3, 13, 15, 9, 11 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
-+ { 4, 6, 0, 2, 12, 14, 8, 10 });
-+#else
-+ __rv.val[0] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 0, 2, 4, 6, 8, 10, 12, 14 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 });
-+#endif
-+ return __rv;
-+}
-+
-+__extension__ extern __inline float16x4x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzip_f16 (float16x4_t __a, float16x4_t __b)
-+{
-+ float16x4x2_t __rv;
-+#ifdef __ARM_BIG_ENDIAN
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 6, 2, 7, 3 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 4, 0, 5, 1 });
-+#else
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x4_t){ 0, 4, 1, 5 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x4_t){ 2, 6, 3, 7 });
-+#endif
-+ return __rv;
-+}
-+
-+__extension__ extern __inline float16x8x2_t
-+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-+vzipq_f16 (float16x8_t __a, float16x8_t __b)
-+{
-+ float16x8x2_t __rv;
-+#ifdef __ARM_BIG_ENDIAN
-+ __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
-+ { 10, 2, 11, 3, 8, 0, 9, 1 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
-+ { 14, 6, 15, 7, 12, 4, 13, 5 });
-+#else
-+ __rv.val[0] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 0, 8, 1, 9, 2, 10, 3, 11 });
-+ __rv.val[1] = __builtin_shuffle (__a, __b,
-+ (uint16x8_t){ 4, 12, 5, 13, 6, 14, 7, 15 });
-+#endif
-+ return __rv;
-+}
-+
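For little-endian lane numbering, with __a = { a0, a1, a2, a3 } and __b = { b0, b1, b2, b3 }, the 64-bit permutes above produce:

/* vtrn_f16: val[0] = { a0, b0, a2, b2 }   val[1] = { a1, b1, a3, b3 }
   vuzp_f16: val[0] = { a0, a2, b0, b2 }   val[1] = { a1, a3, b1, b3 }
   vzip_f16: val[0] = { a0, b0, a1, b1 }   val[1] = { a2, b2, a3, b3 }  */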
-+#endif
-+
- #ifdef __cplusplus
- }
- #endif
---- a/src/gcc/config/arm/arm_neon_builtins.def
-+++ b/src/gcc/config/arm/arm_neon_builtins.def
-@@ -19,6 +19,7 @@
- <http://www.gnu.org/licenses/>. */
-
- VAR2 (BINOP, vadd, v2sf, v4sf)
-+VAR2 (BINOP, vadd, v8hf, v4hf)
- VAR3 (BINOP, vaddls, v8qi, v4hi, v2si)
- VAR3 (BINOP, vaddlu, v8qi, v4hi, v2si)
- VAR3 (BINOP, vaddws, v8qi, v4hi, v2si)
-@@ -32,12 +33,15 @@ VAR8 (BINOP, vqaddu, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
- VAR3 (BINOP, vaddhn, v8hi, v4si, v2di)
- VAR3 (BINOP, vraddhn, v8hi, v4si, v2di)
- VAR2 (BINOP, vmulf, v2sf, v4sf)
-+VAR2 (BINOP, vmulf, v8hf, v4hf)
- VAR2 (BINOP, vmulp, v8qi, v16qi)
- VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
- VAR3 (TERNOP, vmlals, v8qi, v4hi, v2si)
- VAR3 (TERNOP, vmlalu, v8qi, v4hi, v2si)
- VAR2 (TERNOP, vfma, v2sf, v4sf)
-+VAR2 (TERNOP, vfma, v4hf, v8hf)
- VAR2 (TERNOP, vfms, v2sf, v4sf)
-+VAR2 (TERNOP, vfms, v4hf, v8hf)
- VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
- VAR3 (TERNOP, vmlsls, v8qi, v4hi, v2si)
- VAR3 (TERNOP, vmlslu, v8qi, v4hi, v2si)
-@@ -94,6 +98,7 @@ VAR8 (TERNOP_IMM, vsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
- VAR8 (TERNOP_IMM, vrsras_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
- VAR8 (TERNOP_IMM, vrsrau_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
- VAR2 (BINOP, vsub, v2sf, v4sf)
-+VAR2 (BINOP, vsub, v8hf, v4hf)
- VAR3 (BINOP, vsubls, v8qi, v4hi, v2si)
- VAR3 (BINOP, vsublu, v8qi, v4hi, v2si)
- VAR3 (BINOP, vsubws, v8qi, v4hi, v2si)
-@@ -111,12 +116,27 @@ VAR8 (BINOP, vcgt, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
- VAR6 (BINOP, vcgtu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR2 (BINOP, vcage, v2sf, v4sf)
- VAR2 (BINOP, vcagt, v2sf, v4sf)
-+VAR2 (BINOP, vcage, v4hf, v8hf)
-+VAR2 (BINOP, vcagt, v4hf, v8hf)
-+VAR2 (BINOP, vcale, v4hf, v8hf)
-+VAR2 (BINOP, vcalt, v4hf, v8hf)
-+VAR2 (BINOP, vceq, v4hf, v8hf)
-+VAR2 (BINOP, vcge, v4hf, v8hf)
-+VAR2 (BINOP, vcgt, v4hf, v8hf)
-+VAR2 (BINOP, vcle, v4hf, v8hf)
-+VAR2 (BINOP, vclt, v4hf, v8hf)
-+VAR2 (UNOP, vceqz, v4hf, v8hf)
-+VAR2 (UNOP, vcgez, v4hf, v8hf)
-+VAR2 (UNOP, vcgtz, v4hf, v8hf)
-+VAR2 (UNOP, vclez, v4hf, v8hf)
-+VAR2 (UNOP, vcltz, v4hf, v8hf)
- VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (BINOP, vabds, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (BINOP, vabdu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR2 (BINOP, vabdf, v2sf, v4sf)
- VAR3 (BINOP, vabdls, v8qi, v4hi, v2si)
- VAR3 (BINOP, vabdlu, v8qi, v4hi, v2si)
-+VAR2 (BINOP, vabd, v8hf, v4hf)
-
- VAR6 (TERNOP, vabas, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (TERNOP, vabau, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
-@@ -126,27 +146,38 @@ VAR3 (TERNOP, vabalu, v8qi, v4hi, v2si)
- VAR6 (BINOP, vmaxs, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (BINOP, vmaxu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR2 (BINOP, vmaxf, v2sf, v4sf)
-+VAR2 (BINOP, vmaxf, v8hf, v4hf)
-+VAR4 (BINOP, vmaxnm, v2sf, v4sf, v4hf, v8hf)
- VAR6 (BINOP, vmins, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (BINOP, vminu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR2 (BINOP, vminf, v2sf, v4sf)
-+VAR2 (BINOP, vminf, v4hf, v8hf)
-+VAR4 (BINOP, vminnm, v2sf, v4sf, v8hf, v4hf)
-
- VAR3 (BINOP, vpmaxs, v8qi, v4hi, v2si)
- VAR3 (BINOP, vpmaxu, v8qi, v4hi, v2si)
- VAR1 (BINOP, vpmaxf, v2sf)
-+VAR1 (BINOP, vpmaxf, v4hf)
- VAR3 (BINOP, vpmins, v8qi, v4hi, v2si)
- VAR3 (BINOP, vpminu, v8qi, v4hi, v2si)
- VAR1 (BINOP, vpminf, v2sf)
-+VAR1 (BINOP, vpminf, v4hf)
-
- VAR4 (BINOP, vpadd, v8qi, v4hi, v2si, v2sf)
-+VAR1 (BINOP, vpadd, v4hf)
- VAR6 (UNOP, vpaddls, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (UNOP, vpaddlu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (BINOP, vpadals, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR6 (BINOP, vpadalu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR2 (BINOP, vrecps, v2sf, v4sf)
- VAR2 (BINOP, vrsqrts, v2sf, v4sf)
-+VAR2 (BINOP, vrecps, v4hf, v8hf)
-+VAR2 (BINOP, vrsqrts, v4hf, v8hf)
- VAR8 (TERNOP_IMM, vsri_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
- VAR8 (TERNOP_IMM, vsli_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
- VAR8 (UNOP, vabs, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
-+VAR2 (UNOP, vabs, v8hf, v4hf)
-+VAR2 (UNOP, vneg, v8hf, v4hf)
- VAR6 (UNOP, vqabs, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR8 (UNOP, vneg, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
- VAR6 (UNOP, vqneg, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
-@@ -155,8 +186,16 @@ VAR6 (UNOP, vclz, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
- VAR5 (BSWAP, bswap, v4hi, v8hi, v2si, v4si, v2di)
- VAR2 (UNOP, vcnt, v8qi, v16qi)
- VAR4 (UNOP, vrecpe, v2si, v2sf, v4si, v4sf)
-+VAR2 (UNOP, vrecpe, v8hf, v4hf)
- VAR4 (UNOP, vrsqrte, v2si, v2sf, v4si, v4sf)
-+VAR2 (UNOP, vrsqrte, v4hf, v8hf)
- VAR6 (UNOP, vmvn, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
-+VAR2 (UNOP, vrnd, v8hf, v4hf)
-+VAR2 (UNOP, vrnda, v8hf, v4hf)
-+VAR2 (UNOP, vrndm, v8hf, v4hf)
-+VAR2 (UNOP, vrndn, v8hf, v4hf)
-+VAR2 (UNOP, vrndp, v8hf, v4hf)
-+VAR2 (UNOP, vrndx, v8hf, v4hf)
- /* FIXME: vget_lane supports more variants than this! */
- VAR10 (GETLANE, vget_lane,
- v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-@@ -166,8 +205,10 @@ VAR10 (SETLANE, vset_lane,
- VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
- VAR10 (UNOP, vdup_n,
- v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-+VAR2 (UNOP, vdup_n, v8hf, v4hf)
- VAR10 (GETLANE, vdup_lane,
- v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-+VAR2 (GETLANE, vdup_lane, v8hf, v4hf)
- VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
- VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
- VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
-@@ -177,7 +218,7 @@ VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
- VAR3 (UNOP, vqmovun, v8hi, v4si, v2di)
- VAR3 (UNOP, vmovls, v8qi, v4hi, v2si)
- VAR3 (UNOP, vmovlu, v8qi, v4hi, v2si)
--VAR6 (SETLANE, vmul_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-+VAR8 (SETLANE, vmul_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf, v4hf, v8hf)
- VAR6 (MAC_LANE, vmla_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
- VAR2 (MAC_LANE, vmlals_lane, v4hi, v2si)
- VAR2 (MAC_LANE, vmlalu_lane, v4hi, v2si)
-@@ -186,7 +227,7 @@ VAR6 (MAC_LANE, vmls_lane, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
- VAR2 (MAC_LANE, vmlsls_lane, v4hi, v2si)
- VAR2 (MAC_LANE, vmlslu_lane, v4hi, v2si)
- VAR2 (MAC_LANE, vqdmlsl_lane, v4hi, v2si)
--VAR6 (BINOP, vmul_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
-+VAR8 (BINOP, vmul_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf, v4hf, v8hf)
- VAR6 (MAC_N, vmla_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
- VAR2 (MAC_N, vmlals_n, v4hi, v2si)
- VAR2 (MAC_N, vmlalu_n, v4hi, v2si)
-@@ -197,17 +238,27 @@ VAR2 (MAC_N, vmlslu_n, v4hi, v2si)
- VAR2 (MAC_N, vqdmlsl_n, v4hi, v2si)
- VAR10 (SETLANE, vext,
- v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-+VAR2 (SETLANE, vext, v8hf, v4hf)
- VAR8 (UNOP, vrev64, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
- VAR4 (UNOP, vrev32, v8qi, v4hi, v16qi, v8hi)
- VAR2 (UNOP, vrev16, v8qi, v16qi)
- VAR4 (UNOP, vcvts, v2si, v2sf, v4si, v4sf)
-+VAR2 (UNOP, vcvts, v4hi, v8hi)
-+VAR2 (UNOP, vcvts, v4hf, v8hf)
-+VAR2 (UNOP, vcvtu, v4hi, v8hi)
-+VAR2 (UNOP, vcvtu, v4hf, v8hf)
- VAR4 (UNOP, vcvtu, v2si, v2sf, v4si, v4sf)
- VAR4 (BINOP, vcvts_n, v2si, v2sf, v4si, v4sf)
- VAR4 (BINOP, vcvtu_n, v2si, v2sf, v4si, v4sf)
-+VAR2 (BINOP, vcvts_n, v4hf, v8hf)
-+VAR2 (BINOP, vcvtu_n, v4hi, v8hi)
-+VAR2 (BINOP, vcvts_n, v4hi, v8hi)
-+VAR2 (BINOP, vcvtu_n, v4hf, v8hf)
- VAR1 (UNOP, vcvtv4sf, v4hf)
- VAR1 (UNOP, vcvtv4hf, v4sf)
- VAR10 (TERNOP, vbsl,
- v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-+VAR2 (TERNOP, vbsl, v8hf, v4hf)
- VAR2 (UNOP, copysignf, v2sf, v4sf)
- VAR2 (UNOP, vrintn, v2sf, v4sf)
- VAR2 (UNOP, vrinta, v2sf, v4sf)
-@@ -219,6 +270,14 @@ VAR1 (UNOP, vcvtav2sf, v2si)
- VAR1 (UNOP, vcvtav4sf, v4si)
- VAR1 (UNOP, vcvtauv2sf, v2si)
- VAR1 (UNOP, vcvtauv4sf, v4si)
-+VAR2 (UNOP, vcvtas, v4hf, v8hf)
-+VAR2 (UNOP, vcvtau, v4hf, v8hf)
-+VAR2 (UNOP, vcvtms, v4hf, v8hf)
-+VAR2 (UNOP, vcvtmu, v4hf, v8hf)
-+VAR2 (UNOP, vcvtns, v4hf, v8hf)
-+VAR2 (UNOP, vcvtnu, v4hf, v8hf)
-+VAR2 (UNOP, vcvtps, v4hf, v8hf)
-+VAR2 (UNOP, vcvtpu, v4hf, v8hf)
- VAR1 (UNOP, vcvtpv2sf, v2si)
- VAR1 (UNOP, vcvtpv4sf, v4si)
- VAR1 (UNOP, vcvtpuv2sf, v2si)
---- /dev/null
-+++ b/src/gcc/config/arm/arm_vfp_builtins.def
-@@ -0,0 +1,51 @@
-+/* VFP instruction builtin definitions.
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with GCC; see the file COPYING3. If not see
-+ <http://www.gnu.org/licenses/>. */
-+
-+/* This file lists the builtins that may be available when VFP is enabled
-+   but NEON is not. The entries otherwise have the same requirements and
-+   generate the same structures as those in arm_neon_builtins.def. */
-+
-+/* FP16 Arithmetic instructions. */
-+VAR1 (UNOP, vabs, hf)
-+VAR2 (UNOP, vcvths, hf, si)
-+VAR2 (UNOP, vcvthu, hf, si)
-+VAR1 (UNOP, vcvtahs, si)
-+VAR1 (UNOP, vcvtahu, si)
-+VAR1 (UNOP, vcvtmhs, si)
-+VAR1 (UNOP, vcvtmhu, si)
-+VAR1 (UNOP, vcvtnhs, si)
-+VAR1 (UNOP, vcvtnhu, si)
-+VAR1 (UNOP, vcvtphs, si)
-+VAR1 (UNOP, vcvtphu, si)
-+VAR1 (UNOP, vrnd, hf)
-+VAR1 (UNOP, vrnda, hf)
-+VAR1 (UNOP, vrndi, hf)
-+VAR1 (UNOP, vrndm, hf)
-+VAR1 (UNOP, vrndn, hf)
-+VAR1 (UNOP, vrndp, hf)
-+VAR1 (UNOP, vrndx, hf)
-+VAR1 (UNOP, vsqrt, hf)
-+
-+VAR2 (BINOP, vcvths_n, hf, si)
-+VAR2 (BINOP, vcvthu_n, hf, si)
-+VAR1 (BINOP, vmaxnm, hf)
-+VAR1 (BINOP, vminnm, hf)
-+
-+VAR1 (TERNOP, vfma, hf)
-+VAR1 (TERNOP, vfms, hf)
---- a/src/gcc/config/arm/bpabi.h
-+++ b/src/gcc/config/arm/bpabi.h
-@@ -75,6 +75,9 @@
- |mcpu=cortex-a57.cortex-a53 \
- |mcpu=cortex-a72 \
- |mcpu=cortex-a72.cortex-a53 \
-+ |mcpu=cortex-a73 \
-+ |mcpu=cortex-a73.cortex-a35 \
-+ |mcpu=cortex-a73.cortex-a53 \
- |mcpu=exynos-m1 \
- |mcpu=qdf24xx \
- |mcpu=xgene1 \
-@@ -90,6 +93,11 @@
- |march=armv8-a+crc \
- |march=armv8.1-a \
- |march=armv8.1-a+crc \
-+ |march=armv8.2-a \
-+ |march=armv8.2-a+fp16 \
-+ |march=armv8-m.base|mcpu=cortex-m23 \
-+ |march=armv8-m.main \
-+ |march=armv8-m.main+dsp|mcpu=cortex-m33 \
- :%{!r:--be8}}}"
- #else
- #define BE8_LINK_SPEC \
-@@ -105,6 +113,9 @@
- |mcpu=cortex-a57.cortex-a53 \
- |mcpu=cortex-a72 \
- |mcpu=cortex-a72.cortex-a53 \
-+ |mcpu=cortex-a73 \
-+ |mcpu=cortex-a73.cortex-a35 \
-+ |mcpu=cortex-a73.cortex-a53 \
- |mcpu=exynos-m1 \
- |mcpu=qdf24xx \
- |mcpu=xgene1 \
-@@ -121,6 +132,11 @@
- |march=armv8-a+crc \
- |march=armv8.1-a \
- |march=armv8.1-a+crc \
-+ |march=armv8.2-a \
-+ |march=armv8.2-a+fp16 \
-+ |march=armv8-m.base|mcpu=cortex-m23 \
-+ |march=armv8-m.main \
-+ |march=armv8-m.main+dsp|mcpu=cortex-m33 \
- :%{!r:--be8}}}"
- #endif
-
---- a/src/gcc/config/arm/constraints.md
-+++ b/src/gcc/config/arm/constraints.md
-@@ -34,11 +34,13 @@
- ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, Dl, DL, Do, Dv, Dy, Di, Dt, Dp, Dz
- ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
- ;; in Thumb-2 state: Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py
-+;; in all states: Pf
-
- ;; The following memory constraints have been used:
--;; in ARM/Thumb-2 state: Q, Uh, Ut, Uv, Uy, Un, Um, Us
-+;; in ARM/Thumb-2 state: Uh, Ut, Uv, Uy, Un, Um, Us
- ;; in ARM state: Uq
- ;; in Thumb state: Uu, Uw
-+;; in all states: Q
-
-
- (define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS"
-@@ -66,7 +68,7 @@
-
- (define_constraint "j"
- "A constant suitable for a MOVW instruction. (ARM/Thumb-2)"
-- (and (match_test "TARGET_32BIT && arm_arch_thumb2")
-+ (and (match_test "TARGET_HAVE_MOVT")
- (ior (and (match_code "high")
- (match_test "arm_valid_symbolic_address_p (XEXP (op, 0))"))
- (and (match_code "const_int")
-@@ -180,6 +182,13 @@
- (and (match_code "const_int")
- (match_test "TARGET_THUMB1 && ival >= 256 && ival <= 510")))
-
-+(define_constraint "Pf"
-+ "Memory models except relaxed, consume or release ones."
-+ (and (match_code "const_int")
-+ (match_test "!is_mm_relaxed (memmodel_from_int (ival))
-+ && !is_mm_consume (memmodel_from_int (ival))
-+ && !is_mm_release (memmodel_from_int (ival))")))
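Pf therefore accepts __ATOMIC_ACQUIRE, __ATOMIC_ACQ_REL and __ATOMIC_SEQ_CST while rejecting the relaxed, consume and release models; which insn alternatives use the constraint is target-specific. A hedged illustration using the standard __atomic builtins (load_acquire/load_relaxed are illustrative names):

/* The memory-model constant here satisfies "Pf".  */
int
load_acquire (int *__p)
{
  return __atomic_load_n (__p, __ATOMIC_ACQUIRE);
}

/* Relaxed is one of the excluded models.  */
int
load_relaxed (int *__p)
{
  return __atomic_load_n (__p, __ATOMIC_RELAXED);
}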
-+
- (define_constraint "Ps"
- "@internal In Thumb-2 state a constant in the range -255 to +255"
- (and (match_code "const_int")
-@@ -333,13 +342,13 @@
- "@internal
- In ARM/ Thumb2 a const_double which can be used with a vcvt.f32.s32 with fract bits operation"
- (and (match_code "const_double")
-- (match_test "TARGET_32BIT && TARGET_VFP && vfp3_const_double_for_fract_bits (op)")))
-+ (match_test "TARGET_32BIT && vfp3_const_double_for_fract_bits (op)")))
-
- (define_constraint "Dp"
- "@internal
- In ARM/ Thumb2 a const_double which can be used with a vcvt.s32.f32 with bits operation"
- (and (match_code "const_double")
-- (match_test "TARGET_32BIT && TARGET_VFP
-+ (match_test "TARGET_32BIT
- && vfp3_const_double_for_bits (op) > 0")))
-
- (define_register_constraint "Ts" "(arm_restrict_it) ? LO_REGS : GENERAL_REGS"
-@@ -407,7 +416,7 @@
-
- (define_memory_constraint "Q"
- "@internal
-- In ARM/Thumb-2 state an address that is a single base register."
-+ An address that is a single base register."
- (and (match_code "mem")
- (match_test "REG_P (XEXP (op, 0))")))
-
---- a/src/gcc/config/arm/cortex-a53.md
-+++ b/src/gcc/config/arm/cortex-a53.md
-@@ -30,6 +30,7 @@
-
- (define_cpu_unit "cortex_a53_slot0" "cortex_a53")
- (define_cpu_unit "cortex_a53_slot1" "cortex_a53")
-+(final_presence_set "cortex_a53_slot1" "cortex_a53_slot0")
-
- (define_reservation "cortex_a53_slot_any"
- "cortex_a53_slot0\
-@@ -71,41 +72,43 @@
-
- (define_insn_reservation "cortex_a53_shift" 2
- (and (eq_attr "tune" "cortexa53")
-- (eq_attr "type" "adr,shift_imm,shift_reg,mov_imm,mvn_imm"))
-+ (eq_attr "type" "adr,shift_imm,mov_imm,mvn_imm,mov_shift"))
- "cortex_a53_slot_any")
-
--(define_insn_reservation "cortex_a53_alu_rotate_imm" 2
-+(define_insn_reservation "cortex_a53_shift_reg" 2
- (and (eq_attr "tune" "cortexa53")
-- (eq_attr "type" "rotate_imm"))
-- "(cortex_a53_slot1)
-- | (cortex_a53_single_issue)")
-+ (eq_attr "type" "shift_reg,mov_shift_reg"))
-+ "cortex_a53_slot_any+cortex_a53_hazard")
-
- (define_insn_reservation "cortex_a53_alu" 3
- (and (eq_attr "tune" "cortexa53")
- (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,
- alu_sreg,alus_sreg,logic_reg,logics_reg,
- adc_imm,adcs_imm,adc_reg,adcs_reg,
-- bfm,csel,clz,rbit,rev,alu_dsp_reg,
-- mov_reg,mvn_reg,
-- mrs,multiple,no_insn"))
-+ csel,clz,rbit,rev,alu_dsp_reg,
-+ mov_reg,mvn_reg,mrs,multiple,no_insn"))
- "cortex_a53_slot_any")
-
- (define_insn_reservation "cortex_a53_alu_shift" 3
- (and (eq_attr "tune" "cortexa53")
- (eq_attr "type" "alu_shift_imm,alus_shift_imm,
- crc,logic_shift_imm,logics_shift_imm,
-- alu_ext,alus_ext,
-- extend,mov_shift,mvn_shift"))
-+ alu_ext,alus_ext,bfm,bfx,extend,mvn_shift"))
- "cortex_a53_slot_any")
-
- (define_insn_reservation "cortex_a53_alu_shift_reg" 3
- (and (eq_attr "tune" "cortexa53")
- (eq_attr "type" "alu_shift_reg,alus_shift_reg,
- logic_shift_reg,logics_shift_reg,
-- mov_shift_reg,mvn_shift_reg"))
-+ mvn_shift_reg"))
- "cortex_a53_slot_any+cortex_a53_hazard")
-
--(define_insn_reservation "cortex_a53_mul" 3
-+(define_insn_reservation "cortex_a53_alu_extr" 3
-+ (and (eq_attr "tune" "cortexa53")
-+ (eq_attr "type" "rotate_imm"))
-+ "cortex_a53_slot1|cortex_a53_single_issue")
-+
-+(define_insn_reservation "cortex_a53_mul" 4
- (and (eq_attr "tune" "cortexa53")
- (ior (eq_attr "mul32" "yes")
- (eq_attr "mul64" "yes")))
-@@ -189,49 +192,43 @@
- (define_insn_reservation "cortex_a53_branch" 0
- (and (eq_attr "tune" "cortexa53")
- (eq_attr "type" "branch,call"))
-- "cortex_a53_slot_any,cortex_a53_branch")
-+ "cortex_a53_slot_any+cortex_a53_branch")
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; General-purpose register bypasses
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
--;; Model bypasses for unshifted operands to ALU instructions.
-+;; Model bypasses for ALU to ALU instructions.
-+
-+(define_bypass 0 "cortex_a53_shift*"
-+ "cortex_a53_alu")
-
--(define_bypass 1 "cortex_a53_shift"
-- "cortex_a53_shift")
-+(define_bypass 1 "cortex_a53_shift*"
-+ "cortex_a53_shift*,cortex_a53_alu_*")
-
--(define_bypass 1 "cortex_a53_alu,
-- cortex_a53_alu_shift*,
-- cortex_a53_alu_rotate_imm,
-- cortex_a53_shift"
-+(define_bypass 1 "cortex_a53_alu*"
- "cortex_a53_alu")
-
--(define_bypass 2 "cortex_a53_alu,
-- cortex_a53_alu_shift*"
-+(define_bypass 1 "cortex_a53_alu*"
- "cortex_a53_alu_shift*"
- "aarch_forward_to_shift_is_not_shifted_reg")
-
--;; In our model, we allow any general-purpose register operation to
--;; bypass to the accumulator operand of an integer MADD-like operation.
-+(define_bypass 2 "cortex_a53_alu*"
-+ "cortex_a53_alu_*,cortex_a53_shift*")
-+
-+;; Model a bypass from MUL/MLA to MLA instructions.
-
--(define_bypass 1 "cortex_a53_alu*,
-- cortex_a53_load*,
-- cortex_a53_mul"
-+(define_bypass 1 "cortex_a53_mul"
- "cortex_a53_mul"
- "aarch_accumulator_forwarding")
-
--;; Model a bypass from MLA/MUL to many ALU instructions.
-+;; Model a bypass from MUL/MLA to ALU instructions.
-
- (define_bypass 2 "cortex_a53_mul"
-- "cortex_a53_alu,
-- cortex_a53_alu_shift*")
--
--;; We get neater schedules by allowing an MLA/MUL to feed an
--;; early load address dependency to a load.
-+ "cortex_a53_alu")
-
--(define_bypass 2 "cortex_a53_mul"
-- "cortex_a53_load*"
-- "arm_early_load_addr_dep")
-+(define_bypass 3 "cortex_a53_mul"
-+ "cortex_a53_alu_*,cortex_a53_shift*")
-
- ;; Model bypasses for loads which are to be consumed by the ALU.
-
-@@ -239,47 +236,46 @@
- "cortex_a53_alu")
-
- (define_bypass 3 "cortex_a53_load1"
-- "cortex_a53_alu_shift*")
-+ "cortex_a53_alu_*,cortex_a53_shift*")
-+
-+(define_bypass 3 "cortex_a53_load2"
-+ "cortex_a53_alu")
-
- ;; Model a bypass for ALU instructions feeding stores.
-
--(define_bypass 1 "cortex_a53_alu*"
-- "cortex_a53_store1,
-- cortex_a53_store2,
-- cortex_a53_store3plus"
-+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
-+ "cortex_a53_store*"
- "arm_no_early_store_addr_dep")
-
- ;; Model a bypass for load and multiply instructions feeding stores.
-
--(define_bypass 2 "cortex_a53_mul,
-- cortex_a53_load1,
-- cortex_a53_load2,
-- cortex_a53_load3plus"
-- "cortex_a53_store1,
-- cortex_a53_store2,
-- cortex_a53_store3plus"
-+(define_bypass 1 "cortex_a53_mul,
-+ cortex_a53_load*"
-+ "cortex_a53_store*"
- "arm_no_early_store_addr_dep")
-
- ;; Model a GP->FP register move as similar to stores.
-
--(define_bypass 1 "cortex_a53_alu*"
-+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
- "cortex_a53_r2f")
-
--(define_bypass 2 "cortex_a53_mul,
-+(define_bypass 1 "cortex_a53_mul,
- cortex_a53_load1,
-- cortex_a53_load2,
-- cortex_a53_load3plus"
-+ cortex_a53_load2"
- "cortex_a53_r2f")
-
--;; Shifts feeding Load/Store addresses may not be ready in time.
-+(define_bypass 2 "cortex_a53_alu*"
-+ "cortex_a53_r2f_cvt")
-
--(define_bypass 3 "cortex_a53_shift"
-- "cortex_a53_load*"
-- "arm_early_load_addr_dep")
-+(define_bypass 3 "cortex_a53_mul,
-+ cortex_a53_load1,
-+ cortex_a53_load2"
-+ "cortex_a53_r2f_cvt")
-
--(define_bypass 3 "cortex_a53_shift"
-- "cortex_a53_store*"
-- "arm_early_store_addr_dep")
-+;; Model flag forwarding to branches.
-+
-+(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
-+ "cortex_a53_branch")
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Floating-point/Advanced SIMD.
-@@ -535,19 +531,25 @@
- ;; Floating-point to/from core transfers.
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
--(define_insn_reservation "cortex_a53_r2f" 6
-+(define_insn_reservation "cortex_a53_r2f" 2
- (and (eq_attr "tune" "cortexa53")
-- (eq_attr "type" "f_mcr,f_mcrr,f_cvti2f,
-- neon_from_gp, neon_from_gp_q"))
-- "cortex_a53_slot_any,cortex_a53_store,
-- nothing,cortex_a53_fp_alu")
-+ (eq_attr "type" "f_mcr,f_mcrr"))
-+ "cortex_a53_slot_any,cortex_a53_fp_alu")
-
--(define_insn_reservation "cortex_a53_f2r" 6
-+(define_insn_reservation "cortex_a53_f2r" 4
- (and (eq_attr "tune" "cortexa53")
-- (eq_attr "type" "f_mrc,f_mrrc,f_cvtf2i,
-- neon_to_gp, neon_to_gp_q"))
-- "cortex_a53_slot_any,cortex_a53_fp_alu,
-- nothing,cortex_a53_store")
-+ (eq_attr "type" "f_mrc,f_mrrc"))
-+ "cortex_a53_slot_any,cortex_a53_fp_alu")
-+
-+(define_insn_reservation "cortex_a53_r2f_cvt" 4
-+ (and (eq_attr "tune" "cortexa53")
-+ (eq_attr "type" "f_cvti2f, neon_from_gp, neon_from_gp_q"))
-+ "cortex_a53_slot_any,cortex_a53_fp_alu")
-+
-+(define_insn_reservation "cortex_a53_f2r_cvt" 5
-+ (and (eq_attr "tune" "cortexa53")
-+ (eq_attr "type" "f_cvtf2i, neon_to_gp, neon_to_gp_q"))
-+ "cortex_a53_slot_any,cortex_a53_fp_alu")
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Floating-point flag transfer.
---- a/src/gcc/config/arm/cortex-a57.md
-+++ b/src/gcc/config/arm/cortex-a57.md
-@@ -297,7 +297,7 @@
- (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
- alu_sreg,alus_sreg,logic_reg,logics_reg,\
- adc_imm,adcs_imm,adc_reg,adcs_reg,\
-- adr,bfm,clz,rbit,rev,alu_dsp_reg,\
-+ adr,bfx,extend,clz,rbit,rev,alu_dsp_reg,\
- rotate_imm,shift_imm,shift_reg,\
- mov_imm,mov_reg,\
- mvn_imm,mvn_reg,\
-@@ -307,7 +307,7 @@
- ;; ALU ops with immediate shift
- (define_insn_reservation "cortex_a57_alu_shift" 3
- (and (eq_attr "tune" "cortexa57")
-- (eq_attr "type" "extend,\
-+ (eq_attr "type" "bfm,\
- alu_shift_imm,alus_shift_imm,\
- crc,logic_shift_imm,logics_shift_imm,\
- mov_shift,mvn_shift"))
-@@ -726,7 +726,7 @@
-
- (define_insn_reservation "cortex_a57_fp_cpys" 4
- (and (eq_attr "tune" "cortexa57")
-- (eq_attr "type" "fmov"))
-+ (eq_attr "type" "fmov,fcsel"))
- "(ca57_cx1|ca57_cx2)")
-
- (define_insn_reservation "cortex_a57_fp_divs" 12
---- a/src/gcc/config/arm/cortex-a8-neon.md
-+++ b/src/gcc/config/arm/cortex-a8-neon.md
-@@ -357,30 +357,34 @@
- (eq_attr "type" "fmuls"))
- "cortex_a8_vfp,cortex_a8_vfplite*11")
-
-+;; Don't model a reservation for more than 15 cycles as this explodes the
-+;; state space of the automaton for little gain. It is unlikely that the
-+;; scheduler will find enough instructions to hide the full latency of the
-+;; instructions.
- (define_insn_reservation "cortex_a8_vfp_muld" 17
- (and (eq_attr "tune" "cortexa8")
- (eq_attr "type" "fmuld"))
-- "cortex_a8_vfp,cortex_a8_vfplite*16")
-+ "cortex_a8_vfp,cortex_a8_vfplite*15")
-
- (define_insn_reservation "cortex_a8_vfp_macs" 21
- (and (eq_attr "tune" "cortexa8")
- (eq_attr "type" "fmacs,ffmas"))
-- "cortex_a8_vfp,cortex_a8_vfplite*20")
-+ "cortex_a8_vfp,cortex_a8_vfplite*15")
-
- (define_insn_reservation "cortex_a8_vfp_macd" 26
- (and (eq_attr "tune" "cortexa8")
- (eq_attr "type" "fmacd,ffmad"))
-- "cortex_a8_vfp,cortex_a8_vfplite*25")
-+ "cortex_a8_vfp,cortex_a8_vfplite*15")
-
- (define_insn_reservation "cortex_a8_vfp_divs" 37
- (and (eq_attr "tune" "cortexa8")
- (eq_attr "type" "fdivs, fsqrts"))
-- "cortex_a8_vfp,cortex_a8_vfplite*36")
-+ "cortex_a8_vfp,cortex_a8_vfplite*15")
-
- (define_insn_reservation "cortex_a8_vfp_divd" 65
- (and (eq_attr "tune" "cortexa8")
- (eq_attr "type" "fdivd, fsqrtd"))
-- "cortex_a8_vfp,cortex_a8_vfplite*64")
-+ "cortex_a8_vfp,cortex_a8_vfplite*15")
-
- ;; Comparisons can actually take 7 cycles sometimes instead of four,
- ;; but given all the other instructions lumped into type=ffarith that
---- a/src/gcc/config/arm/crypto.md
-+++ b/src/gcc/config/arm/crypto.md
-@@ -18,14 +18,27 @@
- ;; along with GCC; see the file COPYING3. If not see
- ;; <http://www.gnu.org/licenses/>.
-
-+
-+;; When AES/AESMC fusion is enabled we want the register allocation to
-+;; look like:
-+;; AESE Vn, _
-+;; AESMC Vn, Vn
-+;; So prefer to tie operand 1 to operand 0 when fusing.
-+
- (define_insn "crypto_<crypto_pattern>"
-- [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
-+ [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
- (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
-- "register_operand" "w")]
-+ "register_operand" "0,w")]
- CRYPTO_UNARY))]
- "TARGET_CRYPTO"
- "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
-- [(set_attr "type" "<crypto_type>")]
-+ [(set_attr "type" "<crypto_type>")
-+ (set_attr_alternative "enabled"
-+ [(if_then_else (match_test
-+ "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
-+ (const_string "yes" )
-+ (const_string "no"))
-+ (const_string "yes")])]
- )
-
- (define_insn "crypto_<crypto_pattern>"
---- a/src/gcc/config/arm/driver-arm.c
-+++ b/src/gcc/config/arm/driver-arm.c
-@@ -46,6 +46,12 @@ static struct vendor_cpu arm_cpu_table[] = {
- {"0xc0d", "armv7ve", "cortex-a12"},
- {"0xc0e", "armv7ve", "cortex-a17"},
- {"0xc0f", "armv7ve", "cortex-a15"},
-+ {"0xd01", "armv8-a+crc", "cortex-a32"},
-+ {"0xd04", "armv8-a+crc", "cortex-a35"},
-+ {"0xd03", "armv8-a+crc", "cortex-a53"},
-+ {"0xd07", "armv8-a+crc", "cortex-a57"},
-+ {"0xd08", "armv8-a+crc", "cortex-a72"},
-+ {"0xd09", "armv8-a+crc", "cortex-a73"},
- {"0xc14", "armv7-r", "cortex-r4"},
- {"0xc15", "armv7-r", "cortex-r5"},
- {"0xc20", "armv6-m", "cortex-m0"},
---- a/src/gcc/config/arm/elf.h
-+++ b/src/gcc/config/arm/elf.h
-@@ -75,16 +75,7 @@
-
- /* We might need a ARM specific header to function declarations. */
- #undef ASM_DECLARE_FUNCTION_NAME
--#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \
-- do \
-- { \
-- ARM_DECLARE_FUNCTION_NAME (FILE, NAME, DECL); \
-- ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "function"); \
-- ASM_DECLARE_RESULT (FILE, DECL_RESULT (DECL)); \
-- ASM_OUTPUT_LABEL(FILE, NAME); \
-- ARM_OUTPUT_FN_UNWIND (FILE, TRUE); \
-- } \
-- while (0)
-+#define ASM_DECLARE_FUNCTION_NAME arm_asm_declare_function_name
-
- /* We might need an ARM specific trailer for function declarations. */
- #undef ASM_DECLARE_FUNCTION_SIZE
-@@ -148,8 +139,9 @@
- while (0)
-
- /* Horrible hack: We want to prevent some libgcc routines being included
-- for some multilibs. */
--#ifndef __ARM_ARCH_6M__
-+ for some multilibs. The condition should match the one in
-+ libgcc/config/arm/lib1funcs.S. */
-+#if __ARM_ARCH_ISA_ARM || __ARM_ARCH_ISA_THUMB != 1
- #undef L_fixdfsi
- #undef L_fixunsdfsi
- #undef L_truncdfsf2
---- a/src/gcc/config/arm/exynos-m1.md
-+++ b/src/gcc/config/arm/exynos-m1.md
-@@ -358,7 +358,7 @@
- (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
- alu_sreg, alus_sreg, logic_reg, logics_reg,\
- adc_imm, adcs_imm, adc_reg, adcs_reg,\
-- adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
-+ adr, bfm, bfx, clz, rbit, rev, csel, alu_dsp_reg,\
- shift_imm, shift_reg, rotate_imm, extend,\
- mov_imm, mov_reg,\
- mvn_imm, mvn_reg,\
-@@ -372,7 +372,7 @@
- (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
- alu_sreg, alus_sreg, logic_reg, logics_reg,\
- adc_imm, adcs_imm, adc_reg, adcs_reg,\
-- adr, bfm, clz, rbit, rev, alu_dsp_reg,\
-+ adr, bfm, bfx, clz, rbit, rev, alu_dsp_reg,\
- shift_imm, shift_reg, rotate_imm, extend,\
- mov_imm, mov_reg,\
- mvn_imm, mvn_reg,\
---- a/src/gcc/config/arm/iterators.md
-+++ b/src/gcc/config/arm/iterators.md
-@@ -46,7 +46,7 @@
- (define_mode_iterator SIDI [SI DI])
-
- ;; A list of modes which the VFP unit can handle
--(define_mode_iterator SDF [(SF "TARGET_VFP") (DF "TARGET_VFP_DOUBLE")])
-+(define_mode_iterator SDF [(SF "") (DF "TARGET_VFP_DOUBLE")])
-
- ;; Integer element sizes implemented by IWMMXT.
- (define_mode_iterator VMMX [V2SI V4HI V8QI])
-@@ -119,6 +119,10 @@
- ;; All supported vector modes (except those with 64-bit integer elements).
- (define_mode_iterator VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
-
-+;; All supported vector modes including 16-bit float modes.
-+(define_mode_iterator VDQWH [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF
-+ V8HF V4HF])
-+
- ;; Supported integer vector modes (not 64 bit elements).
- (define_mode_iterator VDQIW [V8QI V16QI V4HI V8HI V2SI V4SI])
-
-@@ -141,6 +145,9 @@
- ;; Vector modes form int->float conversions.
- (define_mode_iterator VCVTI [V2SI V4SI])
-
-+;; Vector modes for int->half conversions.
-+(define_mode_iterator VCVTHI [V4HI V8HI])
-+
- ;; Vector modes for doubleword multiply-accumulate, etc. insns.
- (define_mode_iterator VMD [V4HI V2SI V2SF])
-
-@@ -174,6 +181,9 @@
- ;; Modes with 8-bit, 16-bit and 32-bit elements.
- (define_mode_iterator VU [V16QI V8HI V4SI])
-
-+;; Vector modes for 16-bit floating-point support.
-+(define_mode_iterator VH [V8HF V4HF])
-+
- ;; Iterators used for fixed-point support.
- (define_mode_iterator FIXED [QQ HQ SQ UQQ UHQ USQ HA SA UHA USA])
-
-@@ -192,14 +202,17 @@
- ;; Code iterators
- ;;----------------------------------------------------------------------------
-
--;; A list of condition codes used in compare instructions where
--;; the carry flag from the addition is used instead of doing the
-+;; A list of condition codes used in compare instructions where
-+;; the carry flag from the addition is used instead of doing the
- ;; compare a second time.
- (define_code_iterator LTUGEU [ltu geu])
-
- ;; The signed gt, ge comparisons
- (define_code_iterator GTGE [gt ge])
-
-+;; The signed gt, ge, lt, le comparisons
-+(define_code_iterator GLTE [gt ge lt le])
-+
- ;; The unsigned gt, ge comparisons
- (define_code_iterator GTUGEU [gtu geu])
-
-@@ -228,6 +241,12 @@
- ;; Binary operators whose second operand can be shifted.
- (define_code_iterator SHIFTABLE_OPS [plus minus ior xor and])
-
-+;; Operations on the sign of a number.
-+(define_code_iterator ABSNEG [abs neg])
-+
-+;; Conversions.
-+(define_code_iterator FCVT [unsigned_float float])
-+
- ;; plus and minus are the only SHIFTABLE_OPS for which Thumb2 allows
- ;; a stack pointer opoerand. The minus operation is a candidate for an rsub
- ;; and hence only plus is supported.
-@@ -251,10 +270,14 @@
- (define_int_iterator VRINT [UNSPEC_VRINTZ UNSPEC_VRINTP UNSPEC_VRINTM
- UNSPEC_VRINTR UNSPEC_VRINTX UNSPEC_VRINTA])
-
--(define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE UNSPEC_VCLT UNSPEC_VCLE])
-+(define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE
-+ UNSPEC_VCLT UNSPEC_VCLE])
-
- (define_int_iterator NEON_VACMP [UNSPEC_VCAGE UNSPEC_VCAGT])
-
-+(define_int_iterator NEON_VAGLTE [UNSPEC_VCAGE UNSPEC_VCAGT
-+ UNSPEC_VCALE UNSPEC_VCALT])
-+
- (define_int_iterator VCVT [UNSPEC_VRINTP UNSPEC_VRINTM UNSPEC_VRINTA])
-
- (define_int_iterator NEON_VRINT [UNSPEC_NVRINTP UNSPEC_NVRINTZ UNSPEC_NVRINTM
-@@ -323,6 +346,22 @@
-
- (define_int_iterator VCVT_US_N [UNSPEC_VCVT_S_N UNSPEC_VCVT_U_N])
-
-+(define_int_iterator VCVT_HF_US_N [UNSPEC_VCVT_HF_S_N UNSPEC_VCVT_HF_U_N])
-+
-+(define_int_iterator VCVT_SI_US_N [UNSPEC_VCVT_SI_S_N UNSPEC_VCVT_SI_U_N])
-+
-+(define_int_iterator VCVT_HF_US [UNSPEC_VCVTA_S UNSPEC_VCVTA_U
-+ UNSPEC_VCVTM_S UNSPEC_VCVTM_U
-+ UNSPEC_VCVTN_S UNSPEC_VCVTN_U
-+ UNSPEC_VCVTP_S UNSPEC_VCVTP_U])
-+
-+(define_int_iterator VCVTH_US [UNSPEC_VCVTH_S UNSPEC_VCVTH_U])
-+
-+;; Operators for FP16 instructions.
-+(define_int_iterator FP16_RND [UNSPEC_VRND UNSPEC_VRNDA
-+ UNSPEC_VRNDM UNSPEC_VRNDN
-+ UNSPEC_VRNDP UNSPEC_VRNDX])
-+
- (define_int_iterator VQMOVN [UNSPEC_VQMOVN_S UNSPEC_VQMOVN_U])
-
- (define_int_iterator VMOVL [UNSPEC_VMOVL_S UNSPEC_VMOVL_U])
-@@ -366,6 +405,8 @@
-
- (define_int_iterator VQRDMLH_AS [UNSPEC_VQRDMLAH UNSPEC_VQRDMLSH])
-
-+(define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE])
-+
- ;;----------------------------------------------------------------------------
- ;; Mode attributes
- ;;----------------------------------------------------------------------------
-@@ -384,6 +425,10 @@
- (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")
- (V4SI "v4sf") (V4SF "v4si")])
-
-+;; (Opposite) mode to convert to/from for vector-half mode conversions.
-+(define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
-+ (V8HI "V8HF") (V8HF "V8HI")])
-+
- ;; Define element mode for each vector mode.
- (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
- (V4HI "HI") (V8HI "HI")
-@@ -427,12 +472,13 @@
-
- ;; Register width from element mode
- (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
-- (V4HI "P") (V8HI "q")
-- (V4HF "P") (V8HF "q")
-- (V2SI "P") (V4SI "q")
-- (V2SF "P") (V4SF "q")
-- (DI "P") (V2DI "q")
-- (SF "") (DF "P")])
-+ (V4HI "P") (V8HI "q")
-+ (V4HF "P") (V8HF "q")
-+ (V2SI "P") (V4SI "q")
-+ (V2SF "P") (V4SF "q")
-+ (DI "P") (V2DI "q")
-+ (SF "") (DF "P")
-+ (HF "")])
-
- ;; Wider modes with the same number of elements.
- (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
-@@ -448,7 +494,7 @@
- (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
- (V8HF "V4HF") (V4SI "V2SI")
- (V4SF "V2SF") (V2DF "DF")
-- (V2DI "DI")])
-+ (V2DI "DI") (V4HF "HF")])
-
- ;; Same, but lower-case.
- (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
-@@ -475,9 +521,10 @@
- ;; Used for neon_vdup_lane, where the second operand is double-sized
- ;; even when the first one is quad.
- (define_mode_attr V_double_vector_mode [(V16QI "V8QI") (V8HI "V4HI")
-- (V4SI "V2SI") (V4SF "V2SF")
-- (V8QI "V8QI") (V4HI "V4HI")
-- (V2SI "V2SI") (V2SF "V2SF")])
-+ (V4SI "V2SI") (V4SF "V2SF")
-+ (V8QI "V8QI") (V4HI "V4HI")
-+ (V2SI "V2SI") (V2SF "V2SF")
-+ (V8HF "V4HF") (V4HF "V4HF")])
-
- ;; Mode of result of comparison operations (and bit-select operand 1).
- (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
-@@ -496,18 +543,22 @@
- ;; Get element type from double-width mode, for operations where we
- ;; don't care about signedness.
- (define_mode_attr V_if_elem [(V8QI "i8") (V16QI "i8")
-- (V4HI "i16") (V8HI "i16")
-- (V2SI "i32") (V4SI "i32")
-- (DI "i64") (V2DI "i64")
-- (V2SF "f32") (V4SF "f32")
-- (SF "f32") (DF "f64")])
-+ (V4HI "i16") (V8HI "i16")
-+ (V2SI "i32") (V4SI "i32")
-+ (DI "i64") (V2DI "i64")
-+ (V2SF "f32") (V4SF "f32")
-+ (SF "f32") (DF "f64")
-+ (HF "f16") (V4HF "f16")
-+ (V8HF "f16")])
-
- ;; Same, but for operations which work on signed values.
- (define_mode_attr V_s_elem [(V8QI "s8") (V16QI "s8")
-- (V4HI "s16") (V8HI "s16")
-- (V2SI "s32") (V4SI "s32")
-- (DI "s64") (V2DI "s64")
-- (V2SF "f32") (V4SF "f32")])
-+ (V4HI "s16") (V8HI "s16")
-+ (V2SI "s32") (V4SI "s32")
-+ (DI "s64") (V2DI "s64")
-+ (V2SF "f32") (V4SF "f32")
-+ (HF "f16") (V4HF "f16")
-+ (V8HF "f16")])
-
- ;; Same, but for operations which work on unsigned values.
- (define_mode_attr V_u_elem [(V8QI "u8") (V16QI "u8")
-@@ -524,17 +575,22 @@
- (V2SF "32") (V4SF "32")])
-
- (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
-- (V4HI "16") (V8HI "16")
-- (V2SI "32") (V4SI "32")
-- (DI "64") (V2DI "64")
-+ (V4HI "16") (V8HI "16")
-+ (V2SI "32") (V4SI "32")
-+ (DI "64") (V2DI "64")
- (V4HF "16") (V8HF "16")
-- (V2SF "32") (V4SF "32")])
-+ (V2SF "32") (V4SF "32")])
-
- (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
-- (V4HI "h") (V8HI "h")
-- (V2SI "s") (V4SI "s")
-- (DI "d") (V2DI "d")
-- (V2SF "s") (V4SF "s")])
-+ (V4HI "h") (V8HI "h")
-+ (V2SI "s") (V4SI "s")
-+ (DI "d") (V2DI "d")
-+ (V2SF "s") (V4SF "s")
-+ (V2SF "s") (V4SF "s")])
-+
-+(define_mode_attr VH_elem_ch [(V4HI "s") (V8HI "s")
-+ (V4HF "s") (V8HF "s")
-+ (HF "s")])
-
- ;; Element sizes for duplicating ARM registers to all elements of a vector.
- (define_mode_attr VD_dup [(V8QI "8") (V4HI "16") (V2SI "32") (V2SF "32")])
-@@ -570,29 +626,30 @@
- ;; This mode attribute is used to obtain the correct register constraints.
-
- (define_mode_attr scalar_mul_constraint [(V4HI "x") (V2SI "t") (V2SF "t")
-- (V8HI "x") (V4SI "t") (V4SF "t")])
-+ (V8HI "x") (V4SI "t") (V4SF "t")
-+ (V8HF "x") (V4HF "x")])
-
- ;; Predicates used for setting type for neon instructions
-
- (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
-- (V4HI "false") (V8HI "false")
-- (V2SI "false") (V4SI "false")
-- (V4HF "true") (V8HF "true")
-- (V2SF "true") (V4SF "true")
-- (DI "false") (V2DI "false")])
-+ (V4HI "false") (V8HI "false")
-+ (V2SI "false") (V4SI "false")
-+ (V4HF "true") (V8HF "true")
-+ (V2SF "true") (V4SF "true")
-+ (DI "false") (V2DI "false")])
-
- (define_mode_attr Scalar_mul_8_16 [(V8QI "true") (V16QI "true")
-- (V4HI "true") (V8HI "true")
-- (V2SI "false") (V4SI "false")
-- (V2SF "false") (V4SF "false")
-- (DI "false") (V2DI "false")])
--
-+ (V4HI "true") (V8HI "true")
-+ (V2SI "false") (V4SI "false")
-+ (V2SF "false") (V4SF "false")
-+ (DI "false") (V2DI "false")])
-
- (define_mode_attr Is_d_reg [(V8QI "true") (V16QI "false")
-- (V4HI "true") (V8HI "false")
-- (V2SI "true") (V4SI "false")
-- (V2SF "true") (V4SF "false")
-- (DI "true") (V2DI "false")])
-+ (V4HI "true") (V8HI "false")
-+ (V2SI "true") (V4SI "false")
-+ (V2SF "true") (V4SF "false")
-+ (DI "true") (V2DI "false")
-+ (V4HF "true") (V8HF "false")])
-
- (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
- (V4HF "4") (V8HF "8")
-@@ -637,12 +694,14 @@
-
- ;; Mode attribute used to build the "type" attribute.
- (define_mode_attr q [(V8QI "") (V16QI "_q")
-- (V4HI "") (V8HI "_q")
-- (V2SI "") (V4SI "_q")
-+ (V4HI "") (V8HI "_q")
-+ (V2SI "") (V4SI "_q")
-+ (V4HF "") (V8HF "_q")
-+ (V2SF "") (V4SF "_q")
- (V4HF "") (V8HF "_q")
-- (V2SF "") (V4SF "_q")
-- (DI "") (V2DI "_q")
-- (DF "") (V2DF "_q")])
-+ (DI "") (V2DI "_q")
-+ (DF "") (V2DF "_q")
-+ (HF "")])
-
- (define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")])
-
-@@ -679,6 +738,16 @@
- (define_code_attr shift [(ashiftrt "ashr") (lshiftrt "lshr")])
- (define_code_attr shifttype [(ashiftrt "signed") (lshiftrt "unsigned")])
-
-+;; String reprentations of operations on the sign of a number.
-+(define_code_attr absneg_str [(abs "abs") (neg "neg")])
-+
-+;; Conversions.
-+(define_code_attr FCVTI32typename [(unsigned_float "u32") (float "s32")])
-+
-+(define_code_attr float_sup [(unsigned_float "u") (float "s")])
-+
-+(define_code_attr float_SUP [(unsigned_float "U") (float "S")])
-+
- ;;----------------------------------------------------------------------------
- ;; Int attributes
- ;;----------------------------------------------------------------------------
-@@ -710,7 +779,13 @@
- (UNSPEC_VPMAX "s") (UNSPEC_VPMAX_U "u")
- (UNSPEC_VPMIN "s") (UNSPEC_VPMIN_U "u")
- (UNSPEC_VCVT_S "s") (UNSPEC_VCVT_U "u")
-+ (UNSPEC_VCVTA_S "s") (UNSPEC_VCVTA_U "u")
-+ (UNSPEC_VCVTM_S "s") (UNSPEC_VCVTM_U "u")
-+ (UNSPEC_VCVTN_S "s") (UNSPEC_VCVTN_U "u")
-+ (UNSPEC_VCVTP_S "s") (UNSPEC_VCVTP_U "u")
- (UNSPEC_VCVT_S_N "s") (UNSPEC_VCVT_U_N "u")
-+ (UNSPEC_VCVT_HF_S_N "s") (UNSPEC_VCVT_HF_U_N "u")
-+ (UNSPEC_VCVT_SI_S_N "s") (UNSPEC_VCVT_SI_U_N "u")
- (UNSPEC_VQMOVN_S "s") (UNSPEC_VQMOVN_U "u")
- (UNSPEC_VMOVL_S "s") (UNSPEC_VMOVL_U "u")
- (UNSPEC_VSHL_S "s") (UNSPEC_VSHL_U "u")
-@@ -725,13 +800,30 @@
- (UNSPEC_VSHLL_S_N "s") (UNSPEC_VSHLL_U_N "u")
- (UNSPEC_VSRA_S_N "s") (UNSPEC_VSRA_U_N "u")
- (UNSPEC_VRSRA_S_N "s") (UNSPEC_VRSRA_U_N "u")
--
-+ (UNSPEC_VCVTH_S "s") (UNSPEC_VCVTH_U "u")
- ])
-
-+(define_int_attr vcvth_op
-+ [(UNSPEC_VCVTA_S "a") (UNSPEC_VCVTA_U "a")
-+ (UNSPEC_VCVTM_S "m") (UNSPEC_VCVTM_U "m")
-+ (UNSPEC_VCVTN_S "n") (UNSPEC_VCVTN_U "n")
-+ (UNSPEC_VCVTP_S "p") (UNSPEC_VCVTP_U "p")])
-+
-+(define_int_attr fp16_rnd_str
-+ [(UNSPEC_VRND "rnd") (UNSPEC_VRNDA "rnda")
-+ (UNSPEC_VRNDM "rndm") (UNSPEC_VRNDN "rndn")
-+ (UNSPEC_VRNDP "rndp") (UNSPEC_VRNDX "rndx")])
-+
-+(define_int_attr fp16_rnd_insn
-+ [(UNSPEC_VRND "vrintz") (UNSPEC_VRNDA "vrinta")
-+ (UNSPEC_VRNDM "vrintm") (UNSPEC_VRNDN "vrintn")
-+ (UNSPEC_VRNDP "vrintp") (UNSPEC_VRNDX "vrintx")])
-+
- (define_int_attr cmp_op_unsp [(UNSPEC_VCEQ "eq") (UNSPEC_VCGT "gt")
-- (UNSPEC_VCGE "ge") (UNSPEC_VCLE "le")
-- (UNSPEC_VCLT "lt") (UNSPEC_VCAGE "ge")
-- (UNSPEC_VCAGT "gt")])
-+ (UNSPEC_VCGE "ge") (UNSPEC_VCLE "le")
-+ (UNSPEC_VCLT "lt") (UNSPEC_VCAGE "ge")
-+ (UNSPEC_VCAGT "gt") (UNSPEC_VCALE "le")
-+ (UNSPEC_VCALT "lt")])
-
- (define_int_attr r [
- (UNSPEC_VRHADD_S "r") (UNSPEC_VRHADD_U "r")
-@@ -847,3 +939,7 @@
-
- ;; Attributes for VQRDMLAH/VQRDMLSH
- (define_int_attr neon_rdma_as [(UNSPEC_VQRDMLAH "a") (UNSPEC_VQRDMLSH "s")])
-+
-+;; Attributes for VFMA_LANE/ VFMS_LANE
-+(define_int_attr neon_vfm_lane_as
-+ [(UNSPEC_VFMA_LANE "a") (UNSPEC_VFMS_LANE "s")])
---- a/src/gcc/config/arm/neon-testgen.ml
-+++ b/src//dev/null
-@@ -1,324 +0,0 @@
--(* Auto-generate ARM Neon intrinsics tests.
-- Copyright (C) 2006-2016 Free Software Foundation, Inc.
-- Contributed by CodeSourcery.
--
-- This file is part of GCC.
--
-- GCC is free software; you can redistribute it and/or modify it under
-- the terms of the GNU General Public License as published by the Free
-- Software Foundation; either version 3, or (at your option) any later
-- version.
--
-- GCC is distributed in the hope that it will be useful, but WITHOUT ANY
-- WARRANTY; without even the implied warranty of MERCHANTABILITY or
-- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-- for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with GCC; see the file COPYING3. If not see
-- <http://www.gnu.org/licenses/>.
--
-- This is an O'Caml program. The O'Caml compiler is available from:
--
-- http://caml.inria.fr/
--
-- Or from your favourite OS's friendly packaging system. Tested with version
-- 3.09.2, though other versions will probably work too.
--
-- Compile with:
-- ocamlc -c neon.ml
-- ocamlc -o neon-testgen neon.cmo neon-testgen.ml
--
-- Run with:
-- cd /path/to/gcc/testsuite/gcc.target/arm/neon
-- /path/to/neon-testgen
--*)
--
--open Neon
--
--type c_type_flags = Pointer | Const
--
--(* Open a test source file. *)
--let open_test_file dir name =
-- try
-- open_out (dir ^ "/" ^ name ^ ".c")
-- with Sys_error str ->
-- failwith ("Could not create test source file " ^ name ^ ": " ^ str)
--
--(* Emit prologue code to a test source file. *)
--let emit_prologue chan test_name effective_target compile_test_optim =
-- Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic. */\n" test_name;
-- Printf.fprintf chan "/* This file was autogenerated by neon-testgen. */\n\n";
-- Printf.fprintf chan "/* { dg-do assemble } */\n";
-- Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n"
-- effective_target;
-- Printf.fprintf chan "/* { dg-options \"-save-temps %s\" } */\n" compile_test_optim;
-- Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target;
-- Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n"
--
--(* Emit declarations of variables that are going to be passed
-- to an intrinsic, together with one to take a returned value if needed. *)
--let emit_variables chan c_types features spaces =
-- let emit () =
-- ignore (
-- List.fold_left (fun arg_number -> fun (flags, ty) ->
-- let pointer_bit =
-- if List.mem Pointer flags then "*" else ""
-- in
-- (* Const arguments to builtins are directly
-- written in as constants. *)
-- if not (List.mem Const flags) then
-- Printf.fprintf chan "%s%s %sarg%d_%s;\n"
-- spaces ty pointer_bit arg_number ty;
-- arg_number + 1)
-- 0 (List.tl c_types))
-- in
-- match c_types with
-- (_, return_ty) :: tys ->
-- if return_ty <> "void" then begin
-- (* The intrinsic returns a value. We need to do explicit register
-- allocation for vget_low tests or they fail because of copy
-- elimination. *)
-- ((if List.mem Fixed_vector_reg features then
-- Printf.fprintf chan "%sregister %s out_%s asm (\"d18\");\n"
-- spaces return_ty return_ty
-- else if List.mem Fixed_core_reg features then
-- Printf.fprintf chan "%sregister %s out_%s asm (\"r0\");\n"
-- spaces return_ty return_ty
-- else
-- Printf.fprintf chan "%s%s out_%s;\n" spaces return_ty return_ty);
-- emit ())
-- end else
-- (* The intrinsic does not return a value. *)
-- emit ()
-- | _ -> assert false
--
--(* Emit code to call an intrinsic. *)
--let emit_call chan const_valuator c_types name elt_ty =
-- (if snd (List.hd c_types) <> "void" then
-- Printf.fprintf chan " out_%s = " (snd (List.hd c_types))
-- else
-- Printf.fprintf chan " ");
-- Printf.fprintf chan "%s_%s (" (intrinsic_name name) (string_of_elt elt_ty);
-- let print_arg chan arg_number (flags, ty) =
-- (* If the argument is of const type, then directly write in the
-- constant now. *)
-- if List.mem Const flags then
-- match const_valuator with
-- None ->
-- if List.mem Pointer flags then
-- Printf.fprintf chan "0"
-- else
-- Printf.fprintf chan "1"
-- | Some f -> Printf.fprintf chan "%s" (string_of_int (f arg_number))
-- else
-- Printf.fprintf chan "arg%d_%s" arg_number ty
-- in
-- let rec print_args arg_number tys =
-- match tys with
-- [] -> ()
-- | [ty] -> print_arg chan arg_number ty
-- | ty::tys ->
-- print_arg chan arg_number ty;
-- Printf.fprintf chan ", ";
-- print_args (arg_number + 1) tys
-- in
-- print_args 0 (List.tl c_types);
-- Printf.fprintf chan ");\n"
--
--(* Emit epilogue code to a test source file. *)
--let emit_epilogue chan features regexps =
-- let no_op = List.exists (fun feature -> feature = No_op) features in
-- Printf.fprintf chan "}\n\n";
-- if not no_op then
-- List.iter (fun regexp ->
-- Printf.fprintf chan
-- "/* { dg-final { scan-assembler \"%s\" } } */\n" regexp)
-- regexps
-- else
-- ()
--
--
--(* Check a list of C types to determine which ones are pointers and which
-- ones are const. *)
--let check_types tys =
-- let tys' =
-- List.map (fun ty ->
-- let len = String.length ty in
-- if len > 2 && String.get ty (len - 2) = ' '
-- && String.get ty (len - 1) = '*'
-- then ([Pointer], String.sub ty 0 (len - 2))
-- else ([], ty)) tys
-- in
-- List.map (fun (flags, ty) ->
-- if String.length ty > 6 && String.sub ty 0 6 = "const "
-- then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
-- else (flags, ty)) tys'
--
--(* Work out what the effective target should be. *)
--let effective_target features =
-- try
-- match List.find (fun feature ->
-- match feature with Requires_feature _ -> true
-- | Requires_arch _ -> true
-- | Requires_FP_bit 1 -> true
-- | _ -> false)
-- features with
-- Requires_feature "FMA" -> "arm_neonv2"
-- | Requires_feature "CRYPTO" -> "arm_crypto"
-- | Requires_arch 8 -> "arm_v8_neon"
-- | Requires_FP_bit 1 -> "arm_neon_fp16"
-- | _ -> assert false
-- with Not_found -> "arm_neon"
--
--(* Work out what the testcase optimization level should be, default to -O0. *)
--let compile_test_optim features =
-- try
-- match List.find (fun feature ->
-- match feature with Compiler_optim _ -> true
-- | _ -> false)
-- features with
-- Compiler_optim opt -> opt
-- | _ -> assert false
-- with Not_found -> "-O0"
--
--(* Given an intrinsic shape, produce a regexp that will match
-- the right-hand sides of instructions generated by an intrinsic of
-- that shape. *)
--let rec analyze_shape shape =
-- let rec n_things n thing =
-- match n with
-- 0 -> []
-- | n -> thing :: (n_things (n - 1) thing)
-- in
-- let rec analyze_shape_elt elt =
-- match elt with
-- Dreg -> "\\[dD\\]\\[0-9\\]+"
-- | Qreg -> "\\[qQ\\]\\[0-9\\]+"
-- | Corereg -> "\\[rR\\]\\[0-9\\]+"
-- | Immed -> "#\\[0-9\\]+"
-- | VecArray (1, elt) ->
-- let elt_regexp = analyze_shape_elt elt in
-- "((\\\\\\{" ^ elt_regexp ^ "\\\\\\})|(" ^ elt_regexp ^ "))"
-- | VecArray (n, elt) ->
-- let elt_regexp = analyze_shape_elt elt in
-- let alt1 = elt_regexp ^ "-" ^ elt_regexp in
-- let alt2 = commas (fun x -> x) (n_things n elt_regexp) "" in
-- "\\\\\\{((" ^ alt1 ^ ")|(" ^ alt2 ^ "))\\\\\\}"
-- | (PtrTo elt | CstPtrTo elt) ->
-- "\\\\\\[" ^ (analyze_shape_elt elt) ^ "\\(:\\[0-9\\]+\\)?\\\\\\]"
-- | Element_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
-- | Element_of_qreg -> (analyze_shape_elt Qreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
-- | All_elements_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\\\\\]"
-- | Alternatives (elts) -> "(" ^ (String.concat "|" (List.map analyze_shape_elt elts)) ^ ")"
-- in
-- match shape with
-- All (n, elt) -> commas analyze_shape_elt (n_things n elt) ""
-- | Long -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Dreg) ^
-- ", " ^ (analyze_shape_elt Dreg)
-- | Long_noreg elt -> (analyze_shape_elt elt) ^ ", " ^ (analyze_shape_elt elt)
-- | Wide -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Qreg) ^
-- ", " ^ (analyze_shape_elt Dreg)
-- | Wide_noreg elt -> analyze_shape (Long_noreg elt)
-- | Narrow -> (analyze_shape_elt Dreg) ^ ", " ^ (analyze_shape_elt Qreg) ^
-- ", " ^ (analyze_shape_elt Qreg)
-- | Use_operands elts -> commas analyze_shape_elt (Array.to_list elts) ""
-- | By_scalar Dreg ->
-- analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |])
-- | By_scalar Qreg ->
-- analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |])
-- | By_scalar _ -> assert false
-- | Wide_lane ->
-- analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
-- | Wide_scalar ->
-- analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
-- | Pair_result elt ->
-- let elt_regexp = analyze_shape_elt elt in
-- elt_regexp ^ ", " ^ elt_regexp
-- | Unary_scalar _ -> "FIXME Unary_scalar"
-- | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |])
-- | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |])
-- | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |])
--
--(* Generate tests for one intrinsic. *)
--let test_intrinsic dir opcode features shape name munge elt_ty =
-- (* Open the test source file. *)
-- let test_name = name ^ (string_of_elt elt_ty) in
-- let chan = open_test_file dir test_name in
-- (* Work out what argument and return types the intrinsic has. *)
-- let c_arity, new_elt_ty = munge shape elt_ty in
-- let c_types = check_types (strings_of_arity c_arity) in
-- (* Extract any constant valuator (a function specifying what constant
-- values are to be written into the intrinsic call) from the features
-- list. *)
-- let const_valuator =
-- try
-- match (List.find (fun feature -> match feature with
-- Const_valuator _ -> true
-- | _ -> false) features) with
-- Const_valuator f -> Some f
-- | _ -> assert false
-- with Not_found -> None
-- in
-- (* Work out what instruction name(s) to expect. *)
-- let insns = get_insn_names features name in
-- let no_suffix = (new_elt_ty = NoElts) in
-- let insns =
-- if no_suffix then insns
-- else List.map (fun insn ->
-- let suffix = string_of_elt_dots new_elt_ty in
-- insn ^ "\\." ^ suffix) insns
-- in
-- (* Construct a regexp to match against the expected instruction name(s). *)
-- let insn_regexp =
-- match insns with
-- [] -> assert false
-- | [insn] -> insn
-- | _ ->
-- let rec calc_regexp insns cur_regexp =
-- match insns with
-- [] -> cur_regexp
-- | [insn] -> cur_regexp ^ "(" ^ insn ^ "))"
-- | insn::insns -> calc_regexp insns (cur_regexp ^ "(" ^ insn ^ ")|")
-- in calc_regexp insns "("
-- in
-- (* Construct regexps to match against the instructions that this
-- intrinsic expands to. Watch out for any writeback character and
-- comments after the instruction. *)
-- let regexps = List.map (fun regexp -> insn_regexp ^ "\\[ \t\\]+" ^ regexp ^
-- "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
-- (analyze_all_shapes features shape analyze_shape)
-- in
-- let effective_target = effective_target features in
-- let compile_test_optim = compile_test_optim features
-- in
-- (* Emit file and function prologues. *)
-- emit_prologue chan test_name effective_target compile_test_optim;
--
-- if (compare compile_test_optim "-O0") <> 0 then
-- (* Emit variable declarations. *)
-- emit_variables chan c_types features "";
--
-- Printf.fprintf chan "void test_%s (void)\n{\n" test_name;
--
-- if compare compile_test_optim "-O0" = 0 then
-- (* Emit variable declarations. *)
-- emit_variables chan c_types features " ";
--
-- Printf.fprintf chan "\n";
-- (* Emit the call to the intrinsic. *)
-- emit_call chan const_valuator c_types name elt_ty;
-- (* Emit the function epilogue and the DejaGNU scan-assembler directives. *)
-- emit_epilogue chan features regexps;
-- (* Close the test file. *)
-- close_out chan
--
--(* Generate tests for one element of the "ops" table. *)
--let test_intrinsic_group dir (opcode, features, shape, name, munge, types) =
-- List.iter (test_intrinsic dir opcode features shape name munge) types
--
--(* Program entry point. *)
--let _ =
-- let directory = if Array.length Sys.argv <> 1 then Sys.argv.(1) else "." in
-- List.iter (test_intrinsic_group directory) (reinterp @ reinterpq @ ops)
--
---- a/src/gcc/config/arm/neon.md
-+++ b/src/gcc/config/arm/neon.md
-@@ -406,7 +406,7 @@
- (match_operand:SI 2 "immediate_operand" "")]
- "TARGET_NEON"
- {
-- HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-+ HOST_WIDE_INT elem = HOST_WIDE_INT_1 << INTVAL (operands[2]);
- emit_insn (gen_vec_set<mode>_internal (operands[0], operands[1],
- GEN_INT (elem), operands[0]));
- DONE;
-@@ -505,6 +505,20 @@
- (const_string "neon_add<q>")))]
- )
-
-+(define_insn "add<mode>3_fp16"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand" "=w")
-+ (plus:VH
-+ (match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")))]
-+ "TARGET_NEON_FP16INST"
-+ "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set (attr "type")
-+ (if_then_else (match_test "<Is_float_mode>")
-+ (const_string "neon_fp_addsub_s<q>")
-+ (const_string "neon_add<q>")))]
-+)
-+
- (define_insn "adddi3_neon"
- [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?w,?&r,?&r,?&r")
- (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,w,r,0,r")
-@@ -543,6 +557,17 @@
- (const_string "neon_sub<q>")))]
- )
-
-+(define_insn "sub<mode>3_fp16"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand" "=w")
-+ (minus:VH
-+ (match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")))]
-+ "TARGET_NEON_FP16INST"
-+ "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_sub<q>")]
-+)
-+
- (define_insn "subdi3_neon"
- [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r,?w")
- (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0,w")
-@@ -591,6 +616,16 @@
- (const_string "neon_mla_<V_elem_ch><q>")))]
- )
-
-+(define_insn "mul<mode>3add<mode>_neon"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (plus:VH (mult:VH (match_operand:VH 2 "s_register_operand" "w")
-+ (match_operand:VH 3 "s_register_operand" "w"))
-+ (match_operand:VH 1 "s_register_operand" "0")))]
-+ "TARGET_NEON_FP16INST && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-+ "vmla.f16\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
-+ [(set_attr "type" "neon_fp_mla_s<q>")]
-+)
-+
- (define_insn "mul<mode>3neg<mode>add<mode>_neon"
- [(set (match_operand:VDQW 0 "s_register_operand" "=w")
- (minus:VDQW (match_operand:VDQW 1 "s_register_operand" "0")
-@@ -629,6 +664,19 @@
- [(set_attr "type" "neon_fp_mla_s<q>")]
- )
-
-+;; There is limited support for unsafe-math optimizations using the NEON FP16
-+;; arithmetic instructions, so only the intrinsic is currently supported.
-+(define_insn "fma<VH:mode>4_intrinsic"
-+ [(set (match_operand:VH 0 "register_operand" "=w")
-+ (fma:VH
-+ (match_operand:VH 1 "register_operand" "w")
-+ (match_operand:VH 2 "register_operand" "w")
-+ (match_operand:VH 3 "register_operand" "0")))]
-+ "TARGET_NEON_FP16INST"
-+ "vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_mla_s<q>")]
-+)
-+
- (define_insn "*fmsub<VCVTF:mode>4"
- [(set (match_operand:VCVTF 0 "register_operand" "=w")
- (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
-@@ -640,13 +688,25 @@
- )
-
- (define_insn "fmsub<VCVTF:mode>4_intrinsic"
-- [(set (match_operand:VCVTF 0 "register_operand" "=w")
-- (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
-- (match_operand:VCVTF 2 "register_operand" "w")
-- (match_operand:VCVTF 3 "register_operand" "0")))]
-- "TARGET_NEON && TARGET_FMA"
-- "vfms%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-- [(set_attr "type" "neon_fp_mla_s<q>")]
-+ [(set (match_operand:VCVTF 0 "register_operand" "=w")
-+ (fma:VCVTF
-+ (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
-+ (match_operand:VCVTF 2 "register_operand" "w")
-+ (match_operand:VCVTF 3 "register_operand" "0")))]
-+ "TARGET_NEON && TARGET_FMA"
-+ "vfms%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_mla_s<q>")]
-+)
-+
-+(define_insn "fmsub<VH:mode>4_intrinsic"
-+ [(set (match_operand:VH 0 "register_operand" "=w")
-+ (fma:VH
-+ (neg:VH (match_operand:VH 1 "register_operand" "w"))
-+ (match_operand:VH 2 "register_operand" "w")
-+ (match_operand:VH 3 "register_operand" "0")))]
-+ "TARGET_NEON_FP16INST"
-+ "vfms.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_mla_s<q>")]
- )
-
- (define_insn "neon_vrint<NEON_VRINT:nvrint_variant><VCVTF:mode>"
-@@ -860,6 +920,44 @@
- ""
- )
-
-+(define_insn "<absneg_str><mode>2"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (ABSNEG:VH (match_operand:VH 1 "s_register_operand" "w")))]
-+ "TARGET_NEON_FP16INST"
-+ "v<absneg_str>.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_abs<q>")]
-+)
-+
-+(define_expand "neon_v<absneg_str><mode>"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand")
-+ (ABSNEG:VH (match_operand:VH 1 "s_register_operand")))]
-+ "TARGET_NEON_FP16INST"
-+{
-+ emit_insn (gen_<absneg_str><mode>2 (operands[0], operands[1]));
-+ DONE;
-+})
-+
-+(define_insn "neon_v<fp16_rnd_str><mode>"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH
-+ [(match_operand:VH 1 "s_register_operand" "w")]
-+ FP16_RND))]
-+ "TARGET_NEON_FP16INST"
-+ "<fp16_rnd_insn>.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_fp_round_s<q>")]
-+)
-+
-+(define_insn "neon_vrsqrte<mode>"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH
-+ [(match_operand:VH 1 "s_register_operand" "w")]
-+ UNSPEC_VRSQRTE))]
-+ "TARGET_NEON_FP16INST"
-+ "vrsqrte.f16\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_fp_rsqrte_s<q>")]
-+)
-+
- (define_insn "*umin<mode>3_neon"
- [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
- (umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
-@@ -1208,16 +1306,133 @@
-
- ;; Widening operations
-
-+(define_expand "widen_ssum<mode>3"
-+ [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
-+ (plus:<V_double_width>
-+ (sign_extend:<V_double_width>
-+ (match_operand:VQI 1 "s_register_operand" ""))
-+ (match_operand:<V_double_width> 2 "s_register_operand" "")))]
-+ "TARGET_NEON"
-+ {
-+ machine_mode mode = GET_MODE (operands[1]);
-+ rtx p1, p2;
-+
-+ p1 = arm_simd_vect_par_cnst_half (mode, false);
-+ p2 = arm_simd_vect_par_cnst_half (mode, true);
-+
-+ if (operands[0] != operands[2])
-+ emit_move_insn (operands[0], operands[2]);
-+
-+ emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0],
-+ operands[1],
-+ p1,
-+ operands[0]));
-+ emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0],
-+ operands[1],
-+ p2,
-+ operands[0]));
-+ DONE;
-+ }
-+)
-+
-+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
-+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
-+ (plus:<VW:V_widen>
-+ (sign_extend:<VW:V_widen>
-+ (vec_select:VW
-+ (match_operand:VQI 1 "s_register_operand" "%w")
-+ (match_operand:VQI 2 "vect_par_constant_low" "")))
-+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
-+ "TARGET_NEON"
-+{
-+ return BYTES_BIG_ENDIAN ? "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
-+ "vaddw.<V_s_elem>\t%q0, %q3, %e1";
-+}
-+ [(set_attr "type" "neon_add_widen")])
-+
-+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
-+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
-+ (plus:<VW:V_widen>
-+ (sign_extend:<VW:V_widen>
-+ (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
-+ (match_operand:VQI 2 "vect_par_constant_high" "")))
-+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
-+ "TARGET_NEON"
-+{
-+ return BYTES_BIG_ENDIAN ? "vaddw.<V_s_elem>\t%q0, %q3, %e1" :
-+ "vaddw.<V_s_elem>\t%q0, %q3, %f1";
-+}
-+ [(set_attr "type" "neon_add_widen")])
-+
- (define_insn "widen_ssum<mode>3"
- [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
-- (plus:<V_widen> (sign_extend:<V_widen>
-- (match_operand:VW 1 "s_register_operand" "%w"))
-- (match_operand:<V_widen> 2 "s_register_operand" "w")))]
-+ (plus:<V_widen>
-+ (sign_extend:<V_widen>
-+ (match_operand:VW 1 "s_register_operand" "%w"))
-+ (match_operand:<V_widen> 2 "s_register_operand" "w")))]
- "TARGET_NEON"
- "vaddw.<V_s_elem>\t%q0, %q2, %P1"
- [(set_attr "type" "neon_add_widen")]
- )
-
-+(define_expand "widen_usum<mode>3"
-+ [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
-+ (plus:<V_double_width>
-+ (zero_extend:<V_double_width>
-+ (match_operand:VQI 1 "s_register_operand" ""))
-+ (match_operand:<V_double_width> 2 "s_register_operand" "")))]
-+ "TARGET_NEON"
-+ {
-+ machine_mode mode = GET_MODE (operands[1]);
-+ rtx p1, p2;
-+
-+ p1 = arm_simd_vect_par_cnst_half (mode, false);
-+ p2 = arm_simd_vect_par_cnst_half (mode, true);
-+
-+ if (operands[0] != operands[2])
-+ emit_move_insn (operands[0], operands[2]);
-+
-+ emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0],
-+ operands[1],
-+ p1,
-+ operands[0]));
-+ emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0],
-+ operands[1],
-+ p2,
-+ operands[0]));
-+ DONE;
-+ }
-+)
-+
-+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
-+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
-+ (plus:<VW:V_widen>
-+ (zero_extend:<VW:V_widen>
-+ (vec_select:VW
-+ (match_operand:VQI 1 "s_register_operand" "%w")
-+ (match_operand:VQI 2 "vect_par_constant_low" "")))
-+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
-+ "TARGET_NEON"
-+{
-+ return BYTES_BIG_ENDIAN ? "vaddw.<V_u_elem>\t%q0, %q3, %f1" :
-+ "vaddw.<V_u_elem>\t%q0, %q3, %e1";
-+}
-+ [(set_attr "type" "neon_add_widen")])
-+
-+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
-+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
-+ (plus:<VW:V_widen>
-+ (zero_extend:<VW:V_widen>
-+ (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
-+ (match_operand:VQI 2 "vect_par_constant_high" "")))
-+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
-+ "TARGET_NEON"
-+{
-+ return BYTES_BIG_ENDIAN ? "vaddw.<V_u_elem>\t%q0, %q3, %e1" :
-+ "vaddw.<V_u_elem>\t%q0, %q3, %f1";
-+}
-+ [(set_attr "type" "neon_add_widen")])
-+
- (define_insn "widen_usum<mode>3"
- [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
- (plus:<V_widen> (zero_extend:<V_widen>
-@@ -1488,6 +1703,17 @@
- (const_string "neon_reduc_add<q>")))]
- )
-
-+(define_insn "neon_vpaddv4hf"
-+ [(set
-+ (match_operand:V4HF 0 "s_register_operand" "=w")
-+ (unspec:V4HF [(match_operand:V4HF 1 "s_register_operand" "w")
-+ (match_operand:V4HF 2 "s_register_operand" "w")]
-+ UNSPEC_VPADD))]
-+ "TARGET_NEON_FP16INST"
-+ "vpadd.f16\t%P0, %P1, %P2"
-+ [(set_attr "type" "neon_reduc_add")]
-+)
-+
- (define_insn "neon_vpsmin<mode>"
- [(set (match_operand:VD 0 "s_register_operand" "=w")
- (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
-@@ -1836,6 +2062,26 @@
- DONE;
- })
-
-+(define_expand "neon_vadd<mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:VH 2 "s_register_operand")]
-+ "TARGET_NEON_FP16INST"
-+{
-+ emit_insn (gen_add<mode>3_fp16 (operands[0], operands[1], operands[2]));
-+ DONE;
-+})
-+
-+(define_expand "neon_vsub<mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:VH 2 "s_register_operand")]
-+ "TARGET_NEON_FP16INST"
-+{
-+ emit_insn (gen_sub<mode>3_fp16 (operands[0], operands[1], operands[2]));
-+ DONE;
-+})
-+
- ; Note that NEON operations don't support the full IEEE 754 standard: in
- ; particular, denormal values are flushed to zero. This means that GCC cannot
- ; use those instructions for autovectorization, etc. unless
-@@ -1927,6 +2173,17 @@
- (const_string "neon_mul_<V_elem_ch><q>")))]
- )
-
-+(define_insn "neon_vmulf<mode>"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand" "=w")
-+ (mult:VH
-+ (match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")))]
-+ "TARGET_NEON_FP16INST"
-+ "vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_mul_<VH_elem_ch><q>")]
-+)
-+
- (define_expand "neon_vmla<mode>"
- [(match_operand:VDQW 0 "s_register_operand" "=w")
- (match_operand:VDQW 1 "s_register_operand" "0")
-@@ -1955,6 +2212,18 @@
- DONE;
- })
-
-+(define_expand "neon_vfma<VH:mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:VH 2 "s_register_operand")
-+ (match_operand:VH 3 "s_register_operand")]
-+ "TARGET_NEON_FP16INST"
-+{
-+ emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3],
-+ operands[1]));
-+ DONE;
-+})
-+
- (define_expand "neon_vfms<VCVTF:mode>"
- [(match_operand:VCVTF 0 "s_register_operand")
- (match_operand:VCVTF 1 "s_register_operand")
-@@ -1967,6 +2236,18 @@
- DONE;
- })
-
-+(define_expand "neon_vfms<VH:mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:VH 2 "s_register_operand")
-+ (match_operand:VH 3 "s_register_operand")]
-+ "TARGET_NEON_FP16INST"
-+{
-+ emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3],
-+ operands[1]));
-+ DONE;
-+})
-+
- ; Used for intrinsics when flag_unsafe_math_optimizations is false.
-
- (define_insn "neon_vmla<mode>_unspec"
-@@ -2267,6 +2548,72 @@
- [(set_attr "type" "neon_fp_compare_s<q>")]
- )
-
-+(define_expand "neon_vc<cmp_op><mode>"
-+ [(match_operand:<V_cmp_result> 0 "s_register_operand")
-+ (neg:<V_cmp_result>
-+ (COMPARISONS:VH
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:VH 2 "reg_or_zero_operand")))]
-+ "TARGET_NEON_FP16INST"
-+{
-+ /* For FP comparisons use UNSPECS unless -funsafe-math-optimizations
-+ are enabled. */
-+ if (GET_MODE_CLASS (<MODE>mode) == MODE_VECTOR_FLOAT
-+ && !flag_unsafe_math_optimizations)
-+ emit_insn
-+ (gen_neon_vc<cmp_op><mode>_fp16insn_unspec
-+ (operands[0], operands[1], operands[2]));
-+ else
-+ emit_insn
-+ (gen_neon_vc<cmp_op><mode>_fp16insn
-+ (operands[0], operands[1], operands[2]));
-+ DONE;
-+})
-+
-+(define_insn "neon_vc<cmp_op><mode>_fp16insn"
-+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
-+ (neg:<V_cmp_result>
-+ (COMPARISONS:<V_cmp_result>
-+ (match_operand:VH 1 "s_register_operand" "w,w")
-+ (match_operand:VH 2 "reg_or_zero_operand" "w,Dz"))))]
-+ "TARGET_NEON_FP16INST
-+ && !(GET_MODE_CLASS (<MODE>mode) == MODE_VECTOR_FLOAT
-+ && !flag_unsafe_math_optimizations)"
-+{
-+ char pattern[100];
-+ sprintf (pattern, "vc<cmp_op>.%s%%#<V_sz_elem>\t%%<V_reg>0,"
-+ " %%<V_reg>1, %s",
-+ GET_MODE_CLASS (<MODE>mode) == MODE_VECTOR_FLOAT
-+ ? "f" : "<cmp_type>",
-+ which_alternative == 0
-+ ? "%<V_reg>2" : "#0");
-+ output_asm_insn (pattern, operands);
-+ return "";
-+}
-+ [(set (attr "type")
-+ (if_then_else (match_operand 2 "zero_operand")
-+ (const_string "neon_compare_zero<q>")
-+ (const_string "neon_compare<q>")))])
-+
-+(define_insn "neon_vc<cmp_op_unsp><mode>_fp16insn_unspec"
-+ [(set
-+ (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
-+ (unspec:<V_cmp_result>
-+ [(match_operand:VH 1 "s_register_operand" "w,w")
-+ (match_operand:VH 2 "reg_or_zero_operand" "w,Dz")]
-+ NEON_VCMP))]
-+ "TARGET_NEON_FP16INST"
-+{
-+ char pattern[100];
-+ sprintf (pattern, "vc<cmp_op_unsp>.f%%#<V_sz_elem>\t%%<V_reg>0,"
-+ " %%<V_reg>1, %s",
-+ which_alternative == 0
-+ ? "%<V_reg>2" : "#0");
-+ output_asm_insn (pattern, operands);
-+ return "";
-+}
-+ [(set_attr "type" "neon_fp_compare_s<q>")])
-+
- (define_insn "neon_vc<cmp_op>u<mode>"
- [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
- (neg:<V_cmp_result>
-@@ -2318,6 +2665,60 @@
- [(set_attr "type" "neon_fp_compare_s<q>")]
- )
-
-+(define_expand "neon_vca<cmp_op><mode>"
-+ [(set
-+ (match_operand:<V_cmp_result> 0 "s_register_operand")
-+ (neg:<V_cmp_result>
-+ (GLTE:<V_cmp_result>
-+ (abs:VH (match_operand:VH 1 "s_register_operand"))
-+ (abs:VH (match_operand:VH 2 "s_register_operand")))))]
-+ "TARGET_NEON_FP16INST"
-+{
-+ if (flag_unsafe_math_optimizations)
-+ emit_insn (gen_neon_vca<cmp_op><mode>_fp16insn
-+ (operands[0], operands[1], operands[2]));
-+ else
-+ emit_insn (gen_neon_vca<cmp_op><mode>_fp16insn_unspec
-+ (operands[0], operands[1], operands[2]));
-+ DONE;
-+})
-+
-+(define_insn "neon_vca<cmp_op><mode>_fp16insn"
-+ [(set
-+ (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
-+ (neg:<V_cmp_result>
-+ (GLTE:<V_cmp_result>
-+ (abs:VH (match_operand:VH 1 "s_register_operand" "w"))
-+ (abs:VH (match_operand:VH 2 "s_register_operand" "w")))))]
-+ "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
-+ "vac<cmp_op>.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_compare_s<q>")]
-+)
-+
-+(define_insn "neon_vca<cmp_op_unsp><mode>_fp16insn_unspec"
-+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
-+ (unspec:<V_cmp_result>
-+ [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")]
-+ NEON_VAGLTE))]
-+ "TARGET_NEON"
-+ "vac<cmp_op_unsp>.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_compare_s<q>")]
-+)
-+
-+(define_expand "neon_vc<cmp_op>z<mode>"
-+ [(set
-+ (match_operand:<V_cmp_result> 0 "s_register_operand")
-+ (COMPARISONS:<V_cmp_result>
-+ (match_operand:VH 1 "s_register_operand")
-+ (const_int 0)))]
-+ "TARGET_NEON_FP16INST"
-+ {
-+ emit_insn (gen_neon_vc<cmp_op><mode> (operands[0], operands[1],
-+ CONST0_RTX (<MODE>mode)));
-+ DONE;
-+})
-+
- (define_insn "neon_vtst<mode>"
- [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
- (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
-@@ -2338,6 +2739,16 @@
- [(set_attr "type" "neon_abd<q>")]
- )
-
-+(define_insn "neon_vabd<mode>"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")]
-+ UNSPEC_VABD_F))]
-+ "TARGET_NEON_FP16INST"
-+ "vabd.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_abd<q>")]
-+)
-+
- (define_insn "neon_vabdf<mode>"
- [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
- (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
-@@ -2400,6 +2811,51 @@
- [(set_attr "type" "neon_fp_minmax_s<q>")]
- )
-
-+(define_insn "neon_v<maxmin>f<mode>"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH
-+ [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")]
-+ VMAXMINF))]
-+ "TARGET_NEON_FP16INST"
-+ "v<maxmin>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_minmax_s<q>")]
-+)
-+
-+(define_insn "neon_vp<maxmin>fv4hf"
-+ [(set (match_operand:V4HF 0 "s_register_operand" "=w")
-+ (unspec:V4HF
-+ [(match_operand:V4HF 1 "s_register_operand" "w")
-+ (match_operand:V4HF 2 "s_register_operand" "w")]
-+ VPMAXMINF))]
-+ "TARGET_NEON_FP16INST"
-+ "vp<maxmin>.f16\t%P0, %P1, %P2"
-+ [(set_attr "type" "neon_reduc_minmax")]
-+)
-+
-+(define_insn "neon_<fmaxmin_op><mode>"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH
-+ [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")]
-+ VMAXMINFNM))]
-+ "TARGET_NEON_FP16INST"
-+ "<fmaxmin_op>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_minmax_s<q>")]
-+)
-+
-+;; v<maxmin>nm intrinsics.
-+(define_insn "neon_<fmaxmin_op><mode>"
-+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
-+ (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
-+ (match_operand:VCVTF 2 "s_register_operand" "w")]
-+ VMAXMINFNM))]
-+ "TARGET_NEON && TARGET_FPU_ARMV8"
-+ "<fmaxmin_op>.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_minmax_s<q>")]
-+)
-+
- ;; Vector forms for the IEEE-754 fmax()/fmin() functions
- (define_insn "<fmaxmin><mode>3"
- [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
-@@ -2471,6 +2927,17 @@
- [(set_attr "type" "neon_fp_recps_s<q>")]
- )
-
-+(define_insn "neon_vrecps<mode>"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")]
-+ UNSPEC_VRECPS))]
-+ "TARGET_NEON_FP16INST"
-+ "vrecps.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_recps_s<q>")]
-+)
-+
- (define_insn "neon_vrsqrts<mode>"
- [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
- (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
-@@ -2481,6 +2948,17 @@
- [(set_attr "type" "neon_fp_rsqrts_s<q>")]
- )
-
-+(define_insn "neon_vrsqrts<mode>"
-+ [(set
-+ (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:VH 2 "s_register_operand" "w")]
-+ UNSPEC_VRSQRTS))]
-+ "TARGET_NEON_FP16INST"
-+ "vrsqrts.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
-+ [(set_attr "type" "neon_fp_rsqrts_s<q>")]
-+)
-+
- (define_expand "neon_vabs<mode>"
- [(match_operand:VDQW 0 "s_register_operand" "")
- (match_operand:VDQW 1 "s_register_operand" "")]
-@@ -2596,6 +3074,15 @@
- })
-
- (define_insn "neon_vrecpe<mode>"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")]
-+ UNSPEC_VRECPE))]
-+ "TARGET_NEON_FP16INST"
-+ "vrecpe.f16\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_fp_recpe_s<q>")]
-+)
-+
-+(define_insn "neon_vrecpe<mode>"
- [(set (match_operand:V32 0 "s_register_operand" "=w")
- (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")]
- UNSPEC_VRECPE))]
-@@ -2932,6 +3419,28 @@ if (BYTES_BIG_ENDIAN)
- [(set_attr "type" "neon_dup<q>")]
- )
-
-+(define_insn "neon_vdup_lane<mode>_internal"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (vec_duplicate:VH
-+ (vec_select:<V_elem>
-+ (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
-+ (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
-+ "TARGET_NEON && TARGET_FP16"
-+{
-+ if (BYTES_BIG_ENDIAN)
-+ {
-+ int elt = INTVAL (operands[2]);
-+ elt = GET_MODE_NUNITS (<V_double_vector_mode>mode) - 1 - elt;
-+ operands[2] = GEN_INT (elt);
-+ }
-+ if (<Is_d_reg>)
-+ return "vdup.<V_sz_elem>\t%P0, %P1[%c2]";
-+ else
-+ return "vdup.<V_sz_elem>\t%q0, %P1[%c2]";
-+}
-+ [(set_attr "type" "neon_dup<q>")]
-+)
-+
- (define_expand "neon_vdup_lane<mode>"
- [(match_operand:VDQW 0 "s_register_operand" "=w")
- (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
-@@ -2951,6 +3460,25 @@ if (BYTES_BIG_ENDIAN)
- DONE;
- })
-
-+(define_expand "neon_vdup_lane<mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:<V_double_vector_mode> 1 "s_register_operand")
-+ (match_operand:SI 2 "immediate_operand")]
-+ "TARGET_NEON && TARGET_FP16"
-+{
-+ if (BYTES_BIG_ENDIAN)
-+ {
-+ unsigned int elt = INTVAL (operands[2]);
-+ unsigned int reg_nelts
-+ = 64 / GET_MODE_UNIT_BITSIZE (<V_double_vector_mode>mode);
-+ elt ^= reg_nelts - 1;
-+ operands[2] = GEN_INT (elt);
-+ }
-+ emit_insn (gen_neon_vdup_lane<mode>_internal (operands[0], operands[1],
-+ operands[2]));
-+ DONE;
-+})
-+
- ; Scalar index is ignored, since only zero is valid here.
- (define_expand "neon_vdup_lanedi"
- [(match_operand:DI 0 "s_register_operand" "=w")
-@@ -3097,6 +3625,28 @@ if (BYTES_BIG_ENDIAN)
- [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
- )
-
-+(define_insn "neon_vcvt<sup><mode>"
-+ [(set
-+ (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
-+ (unspec:<VH_CVTTO>
-+ [(match_operand:VCVTHI 1 "s_register_operand" "w")]
-+ VCVT_US))]
-+ "TARGET_NEON_FP16INST"
-+ "vcvt.f16.<sup>%#16\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_int_to_fp_<VH_elem_ch><q>")]
-+)
-+
-+(define_insn "neon_vcvt<sup><mode>"
-+ [(set
-+ (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
-+ (unspec:<VH_CVTTO>
-+ [(match_operand:VH 1 "s_register_operand" "w")]
-+ VCVT_US))]
-+ "TARGET_NEON_FP16INST"
-+ "vcvt.<sup>%#16.f16\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_fp_to_int_<VH_elem_ch><q>")]
-+)
-+
- (define_insn "neon_vcvt<sup>_n<mode>"
- [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
- (unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
-@@ -3111,6 +3661,20 @@ if (BYTES_BIG_ENDIAN)
- )
-
- (define_insn "neon_vcvt<sup>_n<mode>"
-+ [(set (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
-+ (unspec:<VH_CVTTO>
-+ [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ VCVT_US_N))]
-+ "TARGET_NEON_FP16INST"
-+{
-+ neon_const_bounds (operands[2], 0, 17);
-+ return "vcvt.<sup>%#16.f16\t%<V_reg>0, %<V_reg>1, %2";
-+}
-+ [(set_attr "type" "neon_fp_to_int_<VH_elem_ch><q>")]
-+)
-+
-+(define_insn "neon_vcvt<sup>_n<mode>"
- [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
- (unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
- (match_operand:SI 2 "immediate_operand" "i")]
-@@ -3123,6 +3687,31 @@ if (BYTES_BIG_ENDIAN)
- [(set_attr "type" "neon_int_to_fp_<V_elem_ch><q>")]
- )
-
-+(define_insn "neon_vcvt<sup>_n<mode>"
-+ [(set (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
-+ (unspec:<VH_CVTTO>
-+ [(match_operand:VCVTHI 1 "s_register_operand" "w")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ VCVT_US_N))]
-+ "TARGET_NEON_FP16INST"
-+{
-+ neon_const_bounds (operands[2], 0, 17);
-+ return "vcvt.f16.<sup>%#16\t%<V_reg>0, %<V_reg>1, %2";
-+}
-+ [(set_attr "type" "neon_int_to_fp_<VH_elem_ch><q>")]
-+)
-+
-+(define_insn "neon_vcvt<vcvth_op><sup><mode>"
-+ [(set
-+ (match_operand:<VH_CVTTO> 0 "s_register_operand" "=w")
-+ (unspec:<VH_CVTTO>
-+ [(match_operand:VH 1 "s_register_operand" "w")]
-+ VCVT_HF_US))]
-+ "TARGET_NEON_FP16INST"
-+ "vcvt<vcvth_op>.<sup>%#16.f16\t%<V_reg>0, %<V_reg>1"
-+ [(set_attr "type" "neon_fp_to_int_<VH_elem_ch><q>")]
-+)
-+
- (define_insn "neon_vmovn<mode>"
- [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
- (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")]
-@@ -3193,6 +3782,18 @@ if (BYTES_BIG_ENDIAN)
- (const_string "neon_mul_<V_elem_ch>_scalar<q>")))]
- )
-
-+(define_insn "neon_vmul_lane<mode>"
-+ [(set (match_operand:VH 0 "s_register_operand" "=w")
-+ (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")
-+ (match_operand:V4HF 2 "s_register_operand"
-+ "<scalar_mul_constraint>")
-+ (match_operand:SI 3 "immediate_operand" "i")]
-+ UNSPEC_VMUL_LANE))]
-+ "TARGET_NEON_FP16INST"
-+ "vmul.f16\t%<V_reg>0, %<V_reg>1, %P2[%c3]"
-+ [(set_attr "type" "neon_fp_mul_s_scalar<q>")]
-+)
-+
- (define_insn "neon_vmull<sup>_lane<mode>"
- [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
- (unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
-@@ -3447,6 +4048,19 @@ if (BYTES_BIG_ENDIAN)
- DONE;
- })
-
-+(define_expand "neon_vmul_n<mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:<V_elem> 2 "s_register_operand")]
-+ "TARGET_NEON_FP16INST"
-+{
-+ rtx tmp = gen_reg_rtx (V4HFmode);
-+ emit_insn (gen_neon_vset_lanev4hf (tmp, operands[2], tmp, const0_rtx));
-+ emit_insn (gen_neon_vmul_lane<mode> (operands[0], operands[1], tmp,
-+ const0_rtx));
-+ DONE;
-+})
-+
- (define_expand "neon_vmulls_n<mode>"
- [(match_operand:<V_widen> 0 "s_register_operand" "")
- (match_operand:VMDI 1 "s_register_operand" "")
-@@ -4168,25 +4782,25 @@ if (BYTES_BIG_ENDIAN)
-
- (define_expand "neon_vtrn<mode>_internal"
- [(parallel
-- [(set (match_operand:VDQW 0 "s_register_operand" "")
-- (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "")
-- (match_operand:VDQW 2 "s_register_operand" "")]
-+ [(set (match_operand:VDQWH 0 "s_register_operand")
-+ (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand")
-+ (match_operand:VDQWH 2 "s_register_operand")]
- UNSPEC_VTRN1))
-- (set (match_operand:VDQW 3 "s_register_operand" "")
-- (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VTRN2))])]
-+ (set (match_operand:VDQWH 3 "s_register_operand")
-+ (unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VTRN2))])]
- "TARGET_NEON"
- ""
- )
-
- ;; Note: Different operand numbering to handle tied registers correctly.
- (define_insn "*neon_vtrn<mode>_insn"
-- [(set (match_operand:VDQW 0 "s_register_operand" "=&w")
-- (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-- (match_operand:VDQW 3 "s_register_operand" "2")]
-- UNSPEC_VTRN1))
-- (set (match_operand:VDQW 2 "s_register_operand" "=&w")
-- (unspec:VDQW [(match_dup 1) (match_dup 3)]
-- UNSPEC_VTRN2))]
-+ [(set (match_operand:VDQWH 0 "s_register_operand" "=&w")
-+ (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0")
-+ (match_operand:VDQWH 3 "s_register_operand" "2")]
-+ UNSPEC_VTRN1))
-+ (set (match_operand:VDQWH 2 "s_register_operand" "=&w")
-+ (unspec:VDQWH [(match_dup 1) (match_dup 3)]
-+ UNSPEC_VTRN2))]
- "TARGET_NEON"
- "vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
- [(set_attr "type" "neon_permute<q>")]
-@@ -4194,25 +4808,25 @@ if (BYTES_BIG_ENDIAN)
-
- (define_expand "neon_vzip<mode>_internal"
- [(parallel
-- [(set (match_operand:VDQW 0 "s_register_operand" "")
-- (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "")
-- (match_operand:VDQW 2 "s_register_operand" "")]
-- UNSPEC_VZIP1))
-- (set (match_operand:VDQW 3 "s_register_operand" "")
-- (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VZIP2))])]
-+ [(set (match_operand:VDQWH 0 "s_register_operand")
-+ (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand")
-+ (match_operand:VDQWH 2 "s_register_operand")]
-+ UNSPEC_VZIP1))
-+ (set (match_operand:VDQWH 3 "s_register_operand")
-+ (unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VZIP2))])]
- "TARGET_NEON"
- ""
- )
-
- ;; Note: Different operand numbering to handle tied registers correctly.
- (define_insn "*neon_vzip<mode>_insn"
-- [(set (match_operand:VDQW 0 "s_register_operand" "=&w")
-- (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-- (match_operand:VDQW 3 "s_register_operand" "2")]
-- UNSPEC_VZIP1))
-- (set (match_operand:VDQW 2 "s_register_operand" "=&w")
-- (unspec:VDQW [(match_dup 1) (match_dup 3)]
-- UNSPEC_VZIP2))]
-+ [(set (match_operand:VDQWH 0 "s_register_operand" "=&w")
-+ (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0")
-+ (match_operand:VDQWH 3 "s_register_operand" "2")]
-+ UNSPEC_VZIP1))
-+ (set (match_operand:VDQWH 2 "s_register_operand" "=&w")
-+ (unspec:VDQWH [(match_dup 1) (match_dup 3)]
-+ UNSPEC_VZIP2))]
- "TARGET_NEON"
- "vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
- [(set_attr "type" "neon_zip<q>")]
-@@ -4220,25 +4834,25 @@ if (BYTES_BIG_ENDIAN)
-
- (define_expand "neon_vuzp<mode>_internal"
- [(parallel
-- [(set (match_operand:VDQW 0 "s_register_operand" "")
-- (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "")
-- (match_operand:VDQW 2 "s_register_operand" "")]
-+ [(set (match_operand:VDQWH 0 "s_register_operand")
-+ (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand")
-+ (match_operand:VDQWH 2 "s_register_operand")]
- UNSPEC_VUZP1))
-- (set (match_operand:VDQW 3 "s_register_operand" "")
-- (unspec:VDQW [(match_dup 1) (match_dup 2)] UNSPEC_VUZP2))])]
-+ (set (match_operand:VDQWH 3 "s_register_operand" "")
-+ (unspec:VDQWH [(match_dup 1) (match_dup 2)] UNSPEC_VUZP2))])]
- "TARGET_NEON"
- ""
- )
-
- ;; Note: Different operand numbering to handle tied registers correctly.
- (define_insn "*neon_vuzp<mode>_insn"
-- [(set (match_operand:VDQW 0 "s_register_operand" "=&w")
-- (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
-- (match_operand:VDQW 3 "s_register_operand" "2")]
-- UNSPEC_VUZP1))
-- (set (match_operand:VDQW 2 "s_register_operand" "=&w")
-- (unspec:VDQW [(match_dup 1) (match_dup 3)]
-- UNSPEC_VUZP2))]
-+ [(set (match_operand:VDQWH 0 "s_register_operand" "=&w")
-+ (unspec:VDQWH [(match_operand:VDQWH 1 "s_register_operand" "0")
-+ (match_operand:VDQWH 3 "s_register_operand" "2")]
-+ UNSPEC_VUZP1))
-+ (set (match_operand:VDQWH 2 "s_register_operand" "=&w")
-+ (unspec:VDQWH [(match_dup 1) (match_dup 3)]
-+ UNSPEC_VUZP2))]
- "TARGET_NEON"
- "vuzp.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
- [(set_attr "type" "neon_zip<q>")]
---- a/src/gcc/config/arm/neon.ml
-+++ b/src//dev/null
-@@ -1,2357 +0,0 @@
--(* Common code for ARM NEON header file, documentation and test case
-- generators.
--
-- Copyright (C) 2006-2016 Free Software Foundation, Inc.
-- Contributed by CodeSourcery.
--
-- This file is part of GCC.
--
-- GCC is free software; you can redistribute it and/or modify it under
-- the terms of the GNU General Public License as published by the Free
-- Software Foundation; either version 3, or (at your option) any later
-- version.
--
-- GCC is distributed in the hope that it will be useful, but WITHOUT ANY
-- WARRANTY; without even the implied warranty of MERCHANTABILITY or
-- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-- for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with GCC; see the file COPYING3. If not see
-- <http://www.gnu.org/licenses/>. *)
--
--(* Shorthand types for vector elements. *)
--type elts = S8 | S16 | S32 | S64 | F16 | F32 | U8 | U16 | U32 | U64 | P8 | P16
-- | P64 | P128 | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64 | Conv of elts * elts
-- | Cast of elts * elts | NoElts
--
--type eltclass = Signed | Unsigned | Float | Poly | Int | Bits
-- | ConvClass of eltclass * eltclass | NoType
--
--(* These vector types correspond directly to C types. *)
--type vectype = T_int8x8 | T_int8x16
-- | T_int16x4 | T_int16x8
-- | T_int32x2 | T_int32x4
-- | T_int64x1 | T_int64x2
-- | T_uint8x8 | T_uint8x16
-- | T_uint16x4 | T_uint16x8
-- | T_uint32x2 | T_uint32x4
-- | T_uint64x1 | T_uint64x2
-- | T_float16x4
-- | T_float32x2 | T_float32x4
-- | T_poly8x8 | T_poly8x16
-- | T_poly16x4 | T_poly16x8
-- | T_immediate of int * int
-- | T_int8 | T_int16
-- | T_int32 | T_int64
-- | T_uint8 | T_uint16
-- | T_uint32 | T_uint64
-- | T_poly8 | T_poly16
-- | T_poly64 | T_poly64x1
-- | T_poly64x2 | T_poly128
-- | T_float16 | T_float32
-- | T_arrayof of int * vectype
-- | T_ptrto of vectype | T_const of vectype
-- | T_void | T_intQI
-- | T_intHI | T_intSI
-- | T_intDI | T_intTI
-- | T_floatHF | T_floatSF
--
--(* The meanings of the following are:
-- TImode : "Tetra", two registers (four words).
-- EImode : "hExa", three registers (six words).
-- OImode : "Octa", four registers (eight words).
-- CImode : "dodeCa", six registers (twelve words).
-- XImode : "heXadeca", eight registers (sixteen words).
--*)
--
--type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode
--
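The comment above encodes a register count in each mode name; making that mapping explicit (a sketch using the constructors just defined, not part of the original file):

  (* Number of D registers backing each struct mode, per the naming
     scheme above (two 32-bit words per register). *)
  let regs_of_inttype = function
      B_TImode -> 2  (* four words *)
    | B_EImode -> 3  (* six words *)
    | B_OImode -> 4  (* eight words *)
    | B_CImode -> 6  (* twelve words *)
    | B_XImode -> 8  (* sixteen words *)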
--type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt
-- | PtrTo of shape_elt | CstPtrTo of shape_elt
-- (* These next ones are used only in the test generator. *)
-- | Element_of_dreg (* Used for "lane" variants. *)
-- | Element_of_qreg (* Likewise. *)
-- | All_elements_of_dreg (* Used for "dup" variants. *)
-- | Alternatives of shape_elt list (* Used for multiple valid operands *)
--
--type shape_form = All of int * shape_elt
-- | Long
-- | Long_noreg of shape_elt
-- | Wide
-- | Wide_noreg of shape_elt
-- | Narrow
-- | Long_imm
-- | Narrow_imm
-- | Binary_imm of shape_elt
-- | Use_operands of shape_elt array
-- | By_scalar of shape_elt
-- | Unary_scalar of shape_elt
-- | Wide_lane
-- | Wide_scalar
-- | Pair_result of shape_elt
--
--type arity = Arity0 of vectype
-- | Arity1 of vectype * vectype
-- | Arity2 of vectype * vectype * vectype
-- | Arity3 of vectype * vectype * vectype * vectype
-- | Arity4 of vectype * vectype * vectype * vectype * vectype
--
--type vecmode = V8QI | V4HI | V4HF | V2SI | V2SF | DI
-- | V16QI | V8HI | V4SI | V4SF | V2DI | TI
-- | QI | HI | SI | SF
--
--type opcode =
-- (* Binary ops. *)
-- Vadd
-- | Vmul
-- | Vmla
-- | Vmls
-- | Vfma
-- | Vfms
-- | Vsub
-- | Vceq
-- | Vcge
-- | Vcgt
-- | Vcle
-- | Vclt
-- | Vcage
-- | Vcagt
-- | Vcale
-- | Vcalt
-- | Vtst
-- | Vabd
-- | Vaba
-- | Vmax
-- | Vmin
-- | Vpadd
-- | Vpada
-- | Vpmax
-- | Vpmin
-- | Vrecps
-- | Vrsqrts
-- | Vshl
-- | Vshr_n
-- | Vshl_n
-- | Vsra_n
-- | Vsri
-- | Vsli
-- (* Logic binops. *)
-- | Vand
-- | Vorr
-- | Veor
-- | Vbic
-- | Vorn
-- | Vbsl
-- (* Ops with scalar. *)
-- | Vmul_lane
-- | Vmla_lane
-- | Vmls_lane
-- | Vmul_n
-- | Vmla_n
-- | Vmls_n
-- | Vmull_n
-- | Vmull_lane
-- | Vqdmull_n
-- | Vqdmull_lane
-- | Vqdmulh_n
-- | Vqdmulh_lane
-- (* Unary ops. *)
-- | Vrintn
-- | Vrinta
-- | Vrintp
-- | Vrintm
-- | Vrintz
-- | Vabs
-- | Vneg
-- | Vcls
-- | Vclz
-- | Vcnt
-- | Vrecpe
-- | Vrsqrte
-- | Vmvn
-- (* Vector extract. *)
-- | Vext
-- (* Reverse elements. *)
-- | Vrev64
-- | Vrev32
-- | Vrev16
-- (* Transposition ops. *)
-- | Vtrn
-- | Vzip
-- | Vuzp
-- (* Loads and stores (VLD1/VST1/VLD2...), elements and structures. *)
-- | Vldx of int
-- | Vstx of int
-- | Vldx_lane of int
-- | Vldx_dup of int
-- | Vstx_lane of int
-- (* Set/extract lanes from a vector. *)
-- | Vget_lane
-- | Vset_lane
-- (* Initialize vector from bit pattern. *)
-- | Vcreate
-- (* Set all lanes to same value. *)
-- | Vdup_n
-- | Vmov_n (* Is this the same? *)
-- (* Duplicate scalar to all lanes of vector. *)
-- | Vdup_lane
-- (* Combine vectors. *)
-- | Vcombine
-- (* Get quadword high/low parts. *)
-- | Vget_high
-- | Vget_low
-- (* Convert vectors. *)
-- | Vcvt
-- | Vcvt_n
-- (* Narrow/lengthen vectors. *)
-- | Vmovn
-- | Vmovl
-- (* Table lookup. *)
-- | Vtbl of int
-- | Vtbx of int
-- (* Reinterpret casts. *)
-- | Vreinterp
--
--let rev_elems revsize elsize nelts _ =
-- let mask = (revsize / elsize) - 1 in
-- let arr = Array.init nelts
-- (fun i -> i lxor mask) in
-- Array.to_list arr
--
--let permute_range i stride nelts increment =
-- let rec build i = function
-- 0 -> []
-- | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in
-- build i nelts
--
--(* Generate a list of integers suitable for vzip. *)
--let zip_range i stride nelts = permute_range i stride nelts 1
--
--(* Generate a list of integers suitable for vunzip. *)
--let uzip_range i stride nelts = permute_range i stride nelts 4
--
--(* Generate a list of integers suitable for trn. *)
--let trn_range i stride nelts = permute_range i stride nelts 2
--
--let zip_elems _ nelts part =
-- match part with
-- `lo -> zip_range 0 nelts (nelts / 2)
-- | `hi -> zip_range (nelts / 2) nelts (nelts / 2)
--
--let uzip_elems _ nelts part =
-- match part with
-- `lo -> uzip_range 0 2 (nelts / 2)
-- | `hi -> uzip_range 1 2 (nelts / 2)
--
--let trn_elems _ nelts part =
-- match part with
-- `lo -> trn_range 0 nelts (nelts / 2)
-- | `hi -> trn_range 1 nelts (nelts / 2)
--
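Worked examples of the mask generators above, computed by hand from the definitions, for an 8-element result (the ignored first argument is the element size; indices 8 and up select from the second input vector of __builtin_shuffle):

  let _ = assert (zip_elems 16 8 `lo = [0; 8; 1; 9; 2; 10; 3; 11])
  let _ = assert (trn_elems 16 8 `lo = [0; 8; 2; 10; 4; 12; 6; 14])
  let _ = assert (uzip_elems 16 8 `lo = [0; 2; 4; 6; 8; 10; 12; 14])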
--(* Features used for documentation, to distinguish between some instruction
-- variants, and to signal special requirements (e.g. swapping arguments). *)
--
--type features =
-- Halving
-- | Rounding
-- | Saturating
-- | Dst_unsign
-- | High_half
-- | Doubling
-- | Flipped of string (* Builtin name to use with flipped arguments. *)
-- | InfoWord (* Pass an extra word for signage/rounding etc. (always passed
--        for All _, Long, Wide, Narrow shape_forms). *)
-- (* Implement builtin as shuffle. The parameter is a function which returns
-- masks suitable for __builtin_shuffle: arguments are (element size,
-- number of elements, high/low part selector). *)
-- | Use_shuffle of (int -> int -> [`lo|`hi] -> int list)
-- (* A specification as to the shape of instruction expected upon
-- disassembly, used if it differs from the shape used to build the
-- intrinsic prototype. Multiple entries in the constructor's argument
-- indicate that the intrinsic expands to more than one assembly
-- instruction, each with a corresponding shape specified here. *)
-- | Disassembles_as of shape_form list
-- | Builtin_name of string (* Override the name of the builtin. *)
-- (* Override the name of the instruction. If more than one name
-- is specified, it means that the instruction can have any of those
-- names. *)
-- | Instruction_name of string list
-- (* Mark that the intrinsic yields no instructions, or expands to yield
-- behavior that the test generator cannot test. *)
-- | No_op
-- (* Mark that the intrinsic has constant arguments that cannot be set
-- to the defaults (zero for pointers and one otherwise) in the test
-- cases. The function supplied must return the integer to be written
-- into the testcase for the argument number (0-based) supplied to it. *)
-- | Const_valuator of (int -> int)
-- | Fixed_vector_reg
-- | Fixed_core_reg
-- (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined. *)
-- | Requires_feature of string
-- (* Mark that the intrinsic requires a particular architecture version. *)
-- | Requires_arch of int
-- (* Mark that the intrinsic requires a particular bit in __ARM_FP to
-- be set. *)
-- | Requires_FP_bit of int
-- (* Compiler optimization level for the test. *)
-- | Compiler_optim of string
--
--exception MixedMode of elts * elts
--
--let rec elt_width = function
-- S8 | U8 | P8 | I8 | B8 -> 8
-- | S16 | U16 | P16 | I16 | B16 | F16 -> 16
-- | S32 | F32 | U32 | I32 | B32 -> 32
-- | S64 | U64 | P64 | I64 | B64 -> 64
-- | P128 -> 128
-- | Conv (a, b) ->
-- let wa = elt_width a and wb = elt_width b in
-- if wa = wb then wa else raise (MixedMode (a, b))
-- | Cast (a, b) -> raise (MixedMode (a, b))
-- | NoElts -> failwith "No elts"
--
--let rec elt_class = function
-- S8 | S16 | S32 | S64 -> Signed
-- | U8 | U16 | U32 | U64 -> Unsigned
-- | P8 | P16 | P64 | P128 -> Poly
-- | F16 | F32 -> Float
-- | I8 | I16 | I32 | I64 -> Int
-- | B8 | B16 | B32 | B64 -> Bits
-- | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b)
-- | NoElts -> NoType
--
--let elt_of_class_width c w =
-- match c, w with
-- Signed, 8 -> S8
-- | Signed, 16 -> S16
-- | Signed, 32 -> S32
-- | Signed, 64 -> S64
-- | Float, 16 -> F16
-- | Float, 32 -> F32
-- | Unsigned, 8 -> U8
-- | Unsigned, 16 -> U16
-- | Unsigned, 32 -> U32
-- | Unsigned, 64 -> U64
-- | Poly, 8 -> P8
-- | Poly, 16 -> P16
-- | Poly, 64 -> P64
-- | Poly, 128 -> P128
-- | Int, 8 -> I8
-- | Int, 16 -> I16
-- | Int, 32 -> I32
-- | Int, 64 -> I64
-- | Bits, 8 -> B8
-- | Bits, 16 -> B16
-- | Bits, 32 -> B32
-- | Bits, 64 -> B64
-- | _ -> failwith "Bad element type"
--
--(* Return unsigned integer element the same width as argument. *)
--let unsigned_of_elt elt =
-- elt_of_class_width Unsigned (elt_width elt)
--
--let signed_of_elt elt =
-- elt_of_class_width Signed (elt_width elt)
--
--(* Return untyped bits element the same width as argument. *)
--let bits_of_elt elt =
-- elt_of_class_width Bits (elt_width elt)
--
--let non_signed_variant = function
-- S8 -> I8
-- | S16 -> I16
-- | S32 -> I32
-- | S64 -> I64
-- | U8 -> I8
-- | U16 -> I16
-- | U32 -> I32
-- | U64 -> I64
-- | x -> x
--
--let poly_unsigned_variant v =
-- let elclass = match elt_class v with
-- Poly -> Unsigned
-- | x -> x in
-- elt_of_class_width elclass (elt_width v)
--
--let widen_elt elt =
-- let w = elt_width elt
-- and c = elt_class elt in
-- elt_of_class_width c (w * 2)
--
--let narrow_elt elt =
-- let w = elt_width elt
-- and c = elt_class elt in
-- elt_of_class_width c (w / 2)
--
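For instance (derived from the definitions above), widening and narrowing preserve the element class and scale the width:

  let _ = assert (widen_elt S16 = S32)
  let _ = assert (widen_elt F16 = F32)
  let _ = assert (narrow_elt U64 = U32)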
--(* If we're trying to find a mode from a "Use_operands" instruction, use the
-- last vector operand as the dominant mode used to invoke the correct builtin.
-- We must stick to this rule in neon.md. *)
--let find_key_operand operands =
-- let rec scan opno =
-- match operands.(opno) with
-- Qreg -> Qreg
-- | Dreg -> Dreg
-- | VecArray (_, Qreg) -> Qreg
-- | VecArray (_, Dreg) -> Dreg
-- | _ -> scan (opno-1)
-- in
-- scan ((Array.length operands) - 1)
--
--(* Find a vecmode from a shape_elt ELT for an instruction with shape_form
-- SHAPE. For a Use_operands shape, if ARGPOS is passed then return the mode
-- for the given argument position, else determine which argument to return a
-- mode for automatically. *)
--
--let rec mode_of_elt ?argpos elt shape =
-- let flt = match elt_class elt with
-- Float | ConvClass(_, Float) -> true | _ -> false in
-- let idx =
-- match elt_width elt with
-- 8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3 | 128 -> 4
-- | _ -> failwith "Bad element width"
-- in match shape with
-- All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg
-- | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg ->
-- if flt then
-- [| V8QI; V4HF; V2SF; DI |].(idx)
-- else
-- [| V8QI; V4HI; V2SI; DI |].(idx)
-- | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg
-- | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg ->
-- [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI; TI|].(idx)
-- | All (_, (Corereg | PtrTo _ | CstPtrTo _)) ->
-- [| QI; HI; if flt then SF else SI; DI |].(idx)
-- | Long | Wide | Wide_lane | Wide_scalar
-- | Long_imm ->
-- [| V8QI; V4HI; V2SI; DI |].(idx)
-- | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx)
-- | Use_operands ops ->
-- begin match argpos with
-- None -> mode_of_elt ?argpos elt (All (0, (find_key_operand ops)))
-- | Some pos -> mode_of_elt ?argpos elt (All (0, ops.(pos)))
-- end
-- | _ -> failwith "invalid shape"
--
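Examples of the selection above, derived by hand from the tables: the element class chooses between the float and integer mode rows, and the element width indexes the column:

  let _ = assert (mode_of_elt F32 (All (3, Dreg)) = V2SF)
  let _ = assert (mode_of_elt S16 (All (3, Qreg)) = V8HI)
  let _ = assert (mode_of_elt U8 Narrow = V16QI)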
--(* Modify an element type dependent on the shape of the instruction and the
-- operand number. *)
--
--let shapemap shape no =
-- let ident = fun x -> x in
-- match shape with
-- All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _
-- | Binary_imm _ -> ident
-- | Long | Long_noreg _ | Wide_scalar | Long_imm ->
-- [| widen_elt; ident; ident |].(no)
-- | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no)
-- | Wide_lane -> [| widen_elt; ident; ident; ident |].(no)
-- | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no)
--
--(* Register type (D/Q) of an operand, based on shape and operand number. *)
--
--let regmap shape no =
-- match shape with
-- All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg
-- | Long -> [| Qreg; Dreg; Dreg |].(no)
-- | Wide -> [| Qreg; Qreg; Dreg |].(no)
-- | Narrow -> [| Dreg; Qreg; Qreg |].(no)
-- | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no)
-- | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no)
-- | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no)
-- | Unary_scalar reg -> [| reg; Dreg; Immed |].(no)
-- | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no)
-- | Binary_imm reg -> [| reg; reg; Immed |].(no)
-- | Long_imm -> [| Qreg; Dreg; Immed |].(no)
-- | Narrow_imm -> [| Dreg; Qreg; Immed |].(no)
-- | Use_operands these -> these.(no)
--
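Taken together, shapemap and regmap determine each operand's element type and register class; e.g. for a Long shape, operand 0 widens its element and lives in a Q register (values read off the tables above):

  let _ = assert (regmap Long 0 = Qreg)
  let _ = assert ((shapemap Long 0) S8 = S16)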
--let type_for_elt shape elt no =
-- let elt = (shapemap shape no) elt in
-- let reg = regmap shape no in
-- let rec type_for_reg_elt reg elt =
-- match reg with
-- Dreg ->
-- begin match elt with
-- S8 -> T_int8x8
-- | S16 -> T_int16x4
-- | S32 -> T_int32x2
-- | S64 -> T_int64x1
-- | U8 -> T_uint8x8
-- | U16 -> T_uint16x4
-- | U32 -> T_uint32x2
-- | U64 -> T_uint64x1
-- | P64 -> T_poly64x1
-- | P128 -> T_poly128
-- | F16 -> T_float16x4
-- | F32 -> T_float32x2
-- | P8 -> T_poly8x8
-- | P16 -> T_poly16x4
-- | _ -> failwith "Bad elt type for Dreg"
-- end
-- | Qreg ->
-- begin match elt with
-- S8 -> T_int8x16
-- | S16 -> T_int16x8
-- | S32 -> T_int32x4
-- | S64 -> T_int64x2
-- | U8 -> T_uint8x16
-- | U16 -> T_uint16x8
-- | U32 -> T_uint32x4
-- | U64 -> T_uint64x2
-- | F32 -> T_float32x4
-- | P8 -> T_poly8x16
-- | P16 -> T_poly16x8
-- | P64 -> T_poly64x2
-- | P128 -> T_poly128
-- | _ -> failwith "Bad elt type for Qreg"
-- end
-- | Corereg ->
-- begin match elt with
-- S8 -> T_int8
-- | S16 -> T_int16
-- | S32 -> T_int32
-- | S64 -> T_int64
-- | U8 -> T_uint8
-- | U16 -> T_uint16
-- | U32 -> T_uint32
-- | U64 -> T_uint64
-- | P8 -> T_poly8
-- | P16 -> T_poly16
-- | P64 -> T_poly64
-- | P128 -> T_poly128
-- | F32 -> T_float32
-- | _ -> failwith "Bad elt type for Corereg"
-- end
-- | Immed ->
-- T_immediate (0, 0)
-- | VecArray (num, sub) ->
-- T_arrayof (num, type_for_reg_elt sub elt)
-- | PtrTo x ->
-- T_ptrto (type_for_reg_elt x elt)
-- | CstPtrTo x ->
-- T_ptrto (T_const (type_for_reg_elt x elt))
-- (* Anything else is solely for the use of the test generator. *)
-- | _ -> assert false
-- in
-- type_for_reg_elt reg elt
--
--(* Return size of a vector type, in bits. *)
--let vectype_size = function
-- T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1
-- | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1
-- | T_float32x2 | T_poly8x8 | T_poly64x1 | T_poly16x4 | T_float16x4 -> 64
-- | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2
-- | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2
-- | T_float32x4 | T_poly8x16 | T_poly64x2 | T_poly16x8 -> 128
-- | _ -> raise Not_found
--
--let inttype_for_array num elttype =
-- let eltsize = vectype_size elttype in
-- let numwords = (num * eltsize) / 32 in
-- match numwords with
-- 4 -> B_TImode
-- | 6 -> B_EImode
-- | 8 -> B_OImode
-- | 12 -> B_CImode
-- | 16 -> B_XImode
-- | _ -> failwith ("no int type for size " ^ string_of_int numwords)
--
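For example (following the definition above), a pair of 128-bit vectors spans (2 * 128) / 32 = 8 words, and a triple of 64-bit vectors spans 6:

  let _ = assert (inttype_for_array 2 T_int32x4 = B_OImode)
  let _ = assert (inttype_for_array 3 T_int16x4 = B_EImode)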
--(* These functions return pairs of (internal, external) types, where "internal"
-- types are those seen by GCC, and "external" are those seen by the assembler.
-- These types aren't necessarily the same, since the intrinsics can munge more
-- than one C type into each assembler opcode. *)
--
--let make_sign_invariant func shape elt =
-- let arity, elt' = func shape elt in
-- arity, non_signed_variant elt'
--
--(* Don't restrict any types. *)
--
--let elts_same make_arity shape elt =
-- let vtype = type_for_elt shape elt in
-- make_arity vtype, elt
--
--(* As sign_invar_*, but when sign matters. *)
--let elts_same_io_lane =
-- elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3))
--
--let elts_same_io =
-- elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2))
--
--let elts_same_2_lane =
-- elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3))
--
--let elts_same_3 = elts_same_2_lane
--
--let elts_same_2 =
-- elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2))
--
--let elts_same_1 =
-- elts_same (fun vtype -> Arity1 (vtype 0, vtype 1))
--
--(* Use for signed/unsigned invariant operations (i.e. where the operation
--   doesn't depend on the sign of the data). *)
--
--let sign_invar_io_lane = make_sign_invariant elts_same_io_lane
--let sign_invar_io = make_sign_invariant elts_same_io
--let sign_invar_2_lane = make_sign_invariant elts_same_2_lane
--let sign_invar_2 = make_sign_invariant elts_same_2
--let sign_invar_1 = make_sign_invariant elts_same_1
--
--(* Sign-sensitive comparison. *)
--
--let cmp_sign_matters shape elt =
-- let vtype = type_for_elt shape elt
-- and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
-- Arity2 (rtype, vtype 1, vtype 2), elt
--
--(* Signed/unsigned invariant comparison. *)
--
--let cmp_sign_invar shape elt =
-- let shape', elt' = cmp_sign_matters shape elt in
-- let elt'' =
-- match non_signed_variant elt' with
-- P8 -> I8
-- | x -> x
-- in
-- shape', elt''
--
--(* Comparison (VTST) where only the element width matters. *)
--
--let cmp_bits shape elt =
-- let vtype = type_for_elt shape elt
-- and rtype = type_for_elt shape (unsigned_of_elt elt) 0
-- and bits_only = bits_of_elt elt in
-- Arity2 (rtype, vtype 1, vtype 2), bits_only
--
--let reg_shift shape elt =
-- let vtype = type_for_elt shape elt
-- and op2type = type_for_elt shape (signed_of_elt elt) 2 in
-- Arity2 (vtype 0, vtype 1, op2type), elt
--
--(* Genericised constant-shift type-generating function. *)
--
--let const_shift mkimm ?arity ?result shape elt =
-- let op2type = (shapemap shape 2) elt in
-- let op2width = elt_width op2type in
-- let op2 = mkimm op2width
-- and op1 = type_for_elt shape elt 1
-- and r_elt =
-- match result with
-- None -> elt
-- | Some restriction -> restriction elt in
-- let rtype = type_for_elt shape r_elt 0 in
-- match arity with
-- None -> Arity2 (rtype, op1, op2), elt
-- | Some mkarity -> mkarity rtype op1 op2, elt
--
--(* Use for immediate right-shifts. *)
--
--let shift_right shape elt =
-- const_shift (fun imm -> T_immediate (1, imm)) shape elt
--
--let shift_right_acc shape elt =
-- const_shift (fun imm -> T_immediate (1, imm))
-- ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt
--
--(* Use for immediate right-shifts when the operation doesn't care about
-- signedness. *)
--
--let shift_right_sign_invar =
-- make_sign_invariant shift_right
--
--(* Immediate right-shift; result is unsigned even when operand is signed. *)
--
--let shift_right_to_uns shape elt =
-- const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt
-- shape elt
--
--(* Immediate left-shift. *)
--
--let shift_left shape elt =
-- const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt
--
--(* Immediate left-shift, unsigned result. *)
--
--let shift_left_to_uns shape elt =
-- const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt
-- shape elt
--
--(* Immediate left-shift, don't care about signs. *)
--
--let shift_left_sign_invar =
-- make_sign_invariant shift_left
--
--(* Shift left/right and insert: only element size matters. *)
--
--let shift_insert shape elt =
-- let arity, elt =
-- const_shift (fun imm -> T_immediate (1, imm))
-- ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in
-- arity, bits_of_elt elt
--
--(* Get/set lane. *)
--
--let get_lane shape elt =
-- let vtype = type_for_elt shape elt in
-- Arity2 (vtype 0, vtype 1, vtype 2),
-- (match elt with P8 -> U8 | P16 -> U16 | S32 | U32 | F32 -> B32 | x -> x)
--
--let set_lane shape elt =
-- let vtype = type_for_elt shape elt in
-- Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt
--
--let set_lane_notype shape elt =
-- let vtype = type_for_elt shape elt in
-- Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts
--
--let create_vector shape elt =
-- let vtype = type_for_elt shape U64 1
-- and rtype = type_for_elt shape elt 0 in
-- Arity1 (rtype, vtype), elt
--
--let conv make_arity shape elt =
-- let edest, esrc = match elt with
-- Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc
-- | _ -> failwith "Non-conversion element in conversion" in
-- let vtype = type_for_elt shape esrc
-- and rtype = type_for_elt shape edest 0 in
-- make_arity rtype vtype, elt
--
--let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1))
--let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2))
--
--(* Operation has an unsigned result even if operands are signed. *)
--
--let dst_unsign make_arity shape elt =
-- let vtype = type_for_elt shape elt
-- and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
-- make_arity rtype vtype, elt
--
--let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1))
--
--let make_bits_only func shape elt =
-- let arity, elt' = func shape elt in
-- arity, bits_of_elt elt'
--
--(* Extend operation. *)
--
--let extend shape elt =
-- let vtype = type_for_elt shape elt in
-- Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt
--
--(* Table look-up operations. Operand 2 is signed/unsigned for signed/unsigned
-- integer ops respectively, or unsigned for polynomial ops. *)
--
--let table mkarity shape elt =
-- let vtype = type_for_elt shape elt in
-- let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in
-- mkarity vtype op2, bits_of_elt elt
--
--let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2))
--let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2))
--
--(* Operations where only bits matter. *)
--
--let bits_1 = make_bits_only elts_same_1
--let bits_2 = make_bits_only elts_same_2
--let bits_3 = make_bits_only elts_same_3
--
--(* Store insns. *)
--let store_1 shape elt =
-- let vtype = type_for_elt shape elt in
-- Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt
--
--let store_3 shape elt =
-- let vtype = type_for_elt shape elt in
-- Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt
--
--let make_notype func shape elt =
-- let arity, _ = func shape elt in
-- arity, NoElts
--
--let notype_1 = make_notype elts_same_1
--let notype_2 = make_notype elts_same_2
--let notype_3 = make_notype elts_same_3
--
--(* Bit-select operations (first operand is unsigned int). *)
--
--let bit_select shape elt =
-- let vtype = type_for_elt shape elt
-- and itype = type_for_elt shape (unsigned_of_elt elt) in
-- Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts
--
--(* Common lists of supported element types. *)
--
--let s_8_32 = [S8; S16; S32]
--let u_8_32 = [U8; U16; U32]
--let su_8_32 = [S8; S16; S32; U8; U16; U32]
--let su_8_64 = S64 :: U64 :: su_8_32
--let su_16_64 = [S16; S32; S64; U16; U32; U64]
--let pf_su_8_16 = [P8; P16; S8; S16; U8; U16]
--let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
--let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
--let suf_32 = [S32; U32; F32]
--
--let ops =
-- [
-- (* Addition. *)
-- Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_32;
-- Vadd, [No_op], All (3, Dreg), "vadd", sign_invar_2, [S64; U64];
-- Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
-- Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
-- Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
-- Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32;
-- Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32;
-- Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
-- All (3, Dreg), "vRhadd", elts_same_2, su_8_32;
-- Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
-- All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32;
-- Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64;
-- Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64;
-- Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64;
-- Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half],
-- Narrow, "vRaddhn", sign_invar_2, su_16_64;
--
-- (* Multiplication. *)
-- Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32;
-- Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32;
-- Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh",
-- elts_same_2, [S16; S32];
-- Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ",
-- elts_same_2, [S16; S32];
-- Vmul,
-- [Saturating; Rounding; Doubling; High_half;
-- Instruction_name ["vqrdmulh"]],
-- All (3, Dreg), "vqRdmulh",
-- elts_same_2, [S16; S32];
-- Vmul,
-- [Saturating; Rounding; Doubling; High_half;
-- Instruction_name ["vqrdmulh"]],
-- All (3, Qreg), "vqRdmulhQ",
-- elts_same_2, [S16; S32];
-- Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32;
-- Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32];
--
-- (* Multiply-accumulate. *)
-- Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32;
-- Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32;
-- Vmla, [], Long, "vmlal", elts_same_io, su_8_32;
-- Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32];
--
-- (* Multiply-subtract. *)
-- Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32;
-- Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32;
-- Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
-- Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
--
-- (* Fused-multiply-accumulate. *)
-- Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
-- Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
-- Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
-- Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];
--
-- (* Round to integral. *)
-- Vrintn, [Builtin_name "vrintn"; Requires_arch 8], Use_operands [| Dreg; Dreg |],
-- "vrndn", elts_same_1, [F32];
-- Vrintn, [Builtin_name "vrintn"; Requires_arch 8], Use_operands [| Qreg; Qreg |],
-- "vrndqn", elts_same_1, [F32];
-- Vrinta, [Builtin_name "vrinta"; Requires_arch 8], Use_operands [| Dreg; Dreg |],
-- "vrnda", elts_same_1, [F32];
-- Vrinta, [Builtin_name "vrinta"; Requires_arch 8], Use_operands [| Qreg; Qreg |],
-- "vrndqa", elts_same_1, [F32];
-- Vrintp, [Builtin_name "vrintp"; Requires_arch 8], Use_operands [| Dreg; Dreg |],
-- "vrndp", elts_same_1, [F32];
-- Vrintp, [Builtin_name "vrintp"; Requires_arch 8], Use_operands [| Qreg; Qreg |],
-- "vrndqp", elts_same_1, [F32];
-- Vrintm, [Builtin_name "vrintm"; Requires_arch 8], Use_operands [| Dreg; Dreg |],
-- "vrndm", elts_same_1, [F32];
-- Vrintm, [Builtin_name "vrintm"; Requires_arch 8], Use_operands [| Qreg; Qreg |],
-- "vrndqm", elts_same_1, [F32];
-- Vrintz, [Builtin_name "vrintz"; Requires_arch 8], Use_operands [| Dreg; Dreg |],
-- "vrnd", elts_same_1, [F32];
-- Vrintz, [Builtin_name "vrintz"; Requires_arch 8], Use_operands [| Qreg; Qreg |],
-- "vrndq", elts_same_1, [F32];
-- (* Subtraction. *)
-- Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
-- Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64];
-- Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
-- Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
-- Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
-- Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32;
-- Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32;
-- Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64;
-- Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64;
-- Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64;
-- Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half],
-- Narrow, "vRsubhn", sign_invar_2, su_16_64;
--
-- (* Comparison, equal. *)
-- Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32;
-- Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32;
--
-- (* Comparison, greater-than or equal. *)
-- Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: s_8_32;
-- Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"],
-- All (3, Dreg), "vcge", cmp_sign_matters,
-- u_8_32;
-- Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: s_8_32;
-- Vcge, [Instruction_name ["vcge"]; Builtin_name "vcgeu"],
-- All (3, Qreg), "vcgeQ", cmp_sign_matters,
-- u_8_32;
--
-- (* Comparison, less-than or equal. *)
-- Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters,
-- F32 :: s_8_32;
-- Vcle, [Instruction_name ["vcge"]; Flipped "vcgeu"],
-- All (3, Dreg), "vcle", cmp_sign_matters,
-- u_8_32;
-- Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"],
-- All (3, Qreg), "vcleQ", cmp_sign_matters,
-- F32 :: s_8_32;
-- Vcle, [Instruction_name ["vcge"]; Flipped "vcgeuQ"],
-- All (3, Qreg), "vcleQ", cmp_sign_matters,
-- u_8_32;
--
-- (* Comparison, greater-than. *)
-- Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: s_8_32;
-- Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"],
-- All (3, Dreg), "vcgt", cmp_sign_matters,
-- u_8_32;
-- Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: s_8_32;
-- Vcgt, [Instruction_name ["vcgt"]; Builtin_name "vcgtu"],
-- All (3, Qreg), "vcgtQ", cmp_sign_matters,
-- u_8_32;
--
-- (* Comparison, less-than. *)
-- Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters,
-- F32 :: s_8_32;
-- Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtu"],
-- All (3, Dreg), "vclt", cmp_sign_matters,
-- u_8_32;
-- Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"],
-- All (3, Qreg), "vcltQ", cmp_sign_matters,
-- F32 :: s_8_32;
-- Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtuQ"],
-- All (3, Qreg), "vcltQ", cmp_sign_matters,
-- u_8_32;
--
-- (* Compare absolute greater-than or equal. *)
-- Vcage, [Instruction_name ["vacge"]],
-- All (3, Dreg), "vcage", cmp_sign_matters, [F32];
-- Vcage, [Instruction_name ["vacge"]],
-- All (3, Qreg), "vcageQ", cmp_sign_matters, [F32];
--
-- (* Compare absolute less-than or equal. *)
-- Vcale, [Instruction_name ["vacge"]; Flipped "vcage"],
-- All (3, Dreg), "vcale", cmp_sign_matters, [F32];
-- Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"],
-- All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32];
--
-- (* Compare absolute greater-than or equal. *)
-- Vcagt, [Instruction_name ["vacgt"]],
-- All (3, Dreg), "vcagt", cmp_sign_matters, [F32];
-- Vcagt, [Instruction_name ["vacgt"]],
-- All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32];
--
-- (* Compare absolute less-than or equal. *)
-- Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"],
-- All (3, Dreg), "vcalt", cmp_sign_matters, [F32];
-- Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"],
-- All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32];
--
-- (* Test bits. *)
-- Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32;
-- Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32;
--
-- (* Absolute difference. *)
-- Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32;
-- Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32;
-- Vabd, [], Long, "vabdl", elts_same_2, su_8_32;
--
-- (* Absolute difference and accumulate. *)
-- Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32;
-- Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32;
-- Vaba, [], Long, "vabal", elts_same_io, su_8_32;
--
-- (* Max. *)
-- Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32;
-- Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32;
--
-- (* Min. *)
-- Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32;
-- Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32;
--
-- (* Pairwise add. *)
-- Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32;
-- Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32;
-- Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32;
--
-- (* Pairwise add, widen and accumulate. *)
-- Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32;
-- Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32;
--
-- (* Folding maximum, minimum. *)
-- Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32;
-- Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32;
--
-- (* Reciprocal step. *)
-- Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32];
-- Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32];
-- Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32];
-- Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32];
--
-- (* Vector shift left. *)
-- Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64;
-- Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64;
-- Vshl, [Instruction_name ["vrshl"]; Rounding],
-- All (3, Dreg), "vRshl", reg_shift, su_8_64;
-- Vshl, [Instruction_name ["vrshl"]; Rounding],
-- All (3, Qreg), "vRshlQ", reg_shift, su_8_64;
-- Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64;
-- Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64;
-- Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
-- All (3, Dreg), "vqRshl", reg_shift, su_8_64;
-- Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
-- All (3, Qreg), "vqRshlQ", reg_shift, su_8_64;
--
-- (* Vector shift right by constant. *)
-- Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64;
-- Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64;
-- Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg,
-- "vRshr_n", shift_right, su_8_64;
-- Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg,
-- "vRshrQ_n", shift_right, su_8_64;
-- Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64;
-- Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n",
-- shift_right_sign_invar, su_16_64;
-- Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64;
-- Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm,
-- "vqRshrn_n", shift_right, su_16_64;
-- Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n",
-- shift_right_to_uns, [S16; S32; S64];
-- Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding],
-- Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64];
--
-- (* Vector shift left by constant. *)
-- Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64;
-- Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64;
-- Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64;
-- Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64;
-- Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n",
-- shift_left_to_uns, [S8; S16; S32; S64];
-- Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n",
-- shift_left_to_uns, [S8; S16; S32; S64];
-- Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32;
--
-- (* Vector shift right by constant and accumulate. *)
-- Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64;
-- Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64;
-- Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg,
-- "vRsra_n", shift_right_acc, su_8_64;
-- Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg,
-- "vRsraQ_n", shift_right_acc, su_8_64;
--
-- (* Vector shift right and insert. *)
-- Vsri, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
-- [P64];
-- Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
-- P8 :: P16 :: su_8_64;
-- Vsri, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
-- [P64];
-- Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
-- P8 :: P16 :: su_8_64;
--
-- (* Vector shift left and insert. *)
-- Vsli, [Requires_feature "CRYPTO"], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
-- [P64];
-- Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
-- P8 :: P16 :: su_8_64;
-- Vsli, [Requires_feature "CRYPTO"], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
-- [P64];
-- Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
-- P8 :: P16 :: su_8_64;
--
-- (* Absolute value. *)
-- Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32];
-- Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32];
-- Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32];
-- Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32];
--
-- (* Negate. *)
-- Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32];
-- Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32];
-- Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32];
-- Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32];
--
-- (* Bitwise not. *)
-- Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32;
-- Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32;
--
-- (* Count leading sign bits. *)
-- Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32];
-- Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32];
--
-- (* Count leading zeros. *)
-- Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32;
-- Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32;
--
-- (* Count number of set bits. *)
-- Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8];
-- Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8];
--
-- (* Reciprocal estimate. *)
-- Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32];
-- Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32];
--
-- (* Reciprocal square-root estimate. *)
-- Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32];
-- Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32];
--
-- (* Get lanes from a vector. *)
-- Vget_lane,
-- [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
-- Instruction_name ["vmov"]],
-- Use_operands [| Corereg; Dreg; Immed |],
-- "vget_lane", get_lane, pf_su_8_32;
-- Vget_lane,
-- [No_op;
-- InfoWord;
-- Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
-- Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
-- Use_operands [| Corereg; Dreg; Immed |],
-- "vget_lane", notype_2, [S64; U64];
-- Vget_lane,
-- [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
-- Instruction_name ["vmov"]],
-- Use_operands [| Corereg; Qreg; Immed |],
-- "vgetQ_lane", get_lane, pf_su_8_32;
-- Vget_lane,
-- [InfoWord;
-- Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
-- Instruction_name ["vmov"; "fmrrd"]; Const_valuator (fun _ -> 0);
-- Fixed_core_reg],
-- Use_operands [| Corereg; Qreg; Immed |],
-- "vgetQ_lane", notype_2, [S64; U64];
--
-- (* Set lanes in a vector. *)
-- Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
-- Instruction_name ["vmov"]],
-- Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
-- set_lane, pf_su_8_32;
-- Vset_lane, [No_op;
-- Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
-- Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
-- Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
-- set_lane_notype, [S64; U64];
-- Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
-- Instruction_name ["vmov"]],
-- Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
-- set_lane, pf_su_8_32;
-- Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
-- Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
-- Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
-- set_lane_notype, [S64; U64];
--
-- (* Create vector from literal bit pattern. *)
-- Vcreate,
-- [Requires_feature "CRYPTO"; No_op], (* Not really, but it can yield various things that are too
-- hard for the test generator at this time. *)
-- Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
-- [P64];
-- Vcreate,
-- [No_op], (* Not really, but it can yield various things that are too
-- hard for the test generator at this time. *)
-- Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
-- pf_su_8_64;
--
-- (* Set all lanes to the same value. *)
-- Vdup_n,
-- [Disassembles_as [Use_operands [| Dreg;
-- Alternatives [ Corereg;
-- Element_of_dreg ] |]]],
-- Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
-- pf_su_8_32;
-- Vdup_n,
-- [No_op; Requires_feature "CRYPTO";
-- Instruction_name ["vmov"];
-- Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
-- Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
-- [P64];
-- Vdup_n,
-- [No_op;
-- Instruction_name ["vmov"];
-- Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
-- Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
-- [S64; U64];
-- Vdup_n,
-- [No_op; Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| Qreg;
-- Alternatives [ Corereg;
-- Element_of_dreg ] |]]],
-- Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
-- [P64];
-- Vdup_n,
-- [Disassembles_as [Use_operands [| Qreg;
-- Alternatives [ Corereg;
-- Element_of_dreg ] |]]],
-- Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
-- pf_su_8_32;
-- Vdup_n,
-- [No_op;
-- Instruction_name ["vmov"];
-- Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
-- Use_operands [| Dreg; Corereg; Corereg |]]],
-- Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1,
-- [S64; U64];
--
-- (* These are just aliases for the above. *)
-- Vmov_n,
-- [Builtin_name "vdup_n";
-- Disassembles_as [Use_operands [| Dreg;
-- Alternatives [ Corereg;
-- Element_of_dreg ] |]]],
-- Use_operands [| Dreg; Corereg |],
-- "vmov_n", bits_1, pf_su_8_32;
-- Vmov_n,
-- [No_op;
-- Builtin_name "vdup_n";
-- Instruction_name ["vmov"];
-- Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
-- Use_operands [| Dreg; Corereg |],
-- "vmov_n", notype_1, [S64; U64];
-- Vmov_n,
-- [Builtin_name "vdupQ_n";
-- Disassembles_as [Use_operands [| Qreg;
-- Alternatives [ Corereg;
-- Element_of_dreg ] |]]],
-- Use_operands [| Qreg; Corereg |],
-- "vmovQ_n", bits_1, pf_su_8_32;
-- Vmov_n,
-- [No_op;
-- Builtin_name "vdupQ_n";
-- Instruction_name ["vmov"];
-- Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
-- Use_operands [| Dreg; Corereg; Corereg |]]],
-- Use_operands [| Qreg; Corereg |],
-- "vmovQ_n", notype_1, [S64; U64];
--
-- (* Duplicate, lane version. We can't use Use_operands here because the
-- rightmost register (always Dreg) would be picked up by find_key_operand,
-- when we want the leftmost register to be used in this case (otherwise
--   the modes are indistinguishable in neon.md, etc.). *)
-- Vdup_lane,
-- [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]],
-- Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32;
-- Vdup_lane,
-- [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
-- Unary_scalar Dreg, "vdup_lane", bits_2, [P64];
-- Vdup_lane,
-- [No_op; Const_valuator (fun _ -> 0)],
-- Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64];
-- Vdup_lane,
-- [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]],
-- Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32;
-- Vdup_lane,
-- [No_op; Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
-- Unary_scalar Qreg, "vdupQ_lane", bits_2, [P64];
-- Vdup_lane,
-- [No_op; Const_valuator (fun _ -> 0)],
-- Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64];
--
-- (* Combining vectors. *)
-- Vcombine, [Requires_feature "CRYPTO"; No_op],
-- Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
-- [P64];
-- Vcombine, [No_op],
-- Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
-- pf_su_8_64;
--
-- (* Splitting vectors. *)
-- Vget_high, [Requires_feature "CRYPTO"; No_op],
-- Use_operands [| Dreg; Qreg |], "vget_high",
-- notype_1, [P64];
-- Vget_high, [No_op],
-- Use_operands [| Dreg; Qreg |], "vget_high",
-- notype_1, pf_su_8_64;
-- Vget_low, [Instruction_name ["vmov"];
-- Disassembles_as [Use_operands [| Dreg; Dreg |]];
-- Fixed_vector_reg],
-- Use_operands [| Dreg; Qreg |], "vget_low",
-- notype_1, pf_su_8_32;
-- Vget_low, [Requires_feature "CRYPTO"; No_op],
-- Use_operands [| Dreg; Qreg |], "vget_low",
-- notype_1, [P64];
-- Vget_low, [No_op],
-- Use_operands [| Dreg; Qreg |], "vget_low",
-- notype_1, [S64; U64];
--
-- (* Conversions. *)
-- Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1,
-- [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
-- Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1,
-- [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
-- Vcvt, [Builtin_name "vcvt" ; Requires_FP_bit 1],
-- Use_operands [| Dreg; Qreg; |], "vcvt", conv_1, [Conv (F16, F32)];
-- Vcvt, [Builtin_name "vcvt" ; Requires_FP_bit 1],
-- Use_operands [| Qreg; Dreg; |], "vcvt", conv_1, [Conv (F32, F16)];
-- Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2,
-- [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
-- Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2,
-- [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
--
-- (* Move, narrowing. *)
-- Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]],
-- Narrow, "vmovn", sign_invar_1, su_16_64;
-- Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating],
-- Narrow, "vqmovn", elts_same_1, su_16_64;
-- Vmovn,
-- [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign],
-- Narrow, "vqmovun", dst_unsign_1,
-- [S16; S32; S64];
--
-- (* Move, long. *)
-- Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]],
-- Long, "vmovl", elts_same_1, su_8_32;
--
-- (* Table lookup. *)
-- Vtbl 1,
-- [Instruction_name ["vtbl"];
-- Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
-- Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8];
-- Vtbl 2, [Instruction_name ["vtbl"]],
-- Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2,
-- [U8; S8; P8];
-- Vtbl 3, [Instruction_name ["vtbl"]],
-- Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2,
-- [U8; S8; P8];
-- Vtbl 4, [Instruction_name ["vtbl"]],
-- Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2,
-- [U8; S8; P8];
--
-- (* Extended table lookup. *)
-- Vtbx 1,
-- [Instruction_name ["vtbx"];
-- Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
-- Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8];
-- Vtbx 2, [Instruction_name ["vtbx"]],
-- Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io,
-- [U8; S8; P8];
-- Vtbx 3, [Instruction_name ["vtbx"]],
-- Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io,
-- [U8; S8; P8];
-- Vtbx 4, [Instruction_name ["vtbx"]],
-- Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io,
-- [U8; S8; P8];
--
-- (* Multiply, lane. (note: these were undocumented at the time of
-- writing). *)
-- Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane,
-- [S16; S32; U16; U32; F32];
-- Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane,
-- [S16; S32; U16; U32; F32];
--
-- (* Multiply-accumulate, lane. *)
-- Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane,
-- [S16; S32; U16; U32; F32];
-- Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane,
-- [S16; S32; U16; U32; F32];
-- Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane,
-- [S16; S32; U16; U32];
-- Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane",
-- elts_same_io_lane, [S16; S32];
--
-- (* Multiply-subtract, lane. *)
-- Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane,
-- [S16; S32; U16; U32; F32];
-- Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane,
-- [S16; S32; U16; U32; F32];
-- Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane,
-- [S16; S32; U16; U32];
-- Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane",
-- elts_same_io_lane, [S16; S32];
--
-- (* Long multiply, lane. *)
-- Vmull_lane, [],
-- Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32];
--
-- (* Saturating doubling long multiply, lane. *)
-- Vqdmull_lane, [Saturating; Doubling],
-- Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32];
--
-- (* Saturating doubling long multiply high, lane. *)
-- Vqdmulh_lane, [Saturating; Halving],
-- By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32];
-- Vqdmulh_lane, [Saturating; Halving],
-- By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32];
-- Vqdmulh_lane, [Saturating; Halving; Rounding;
-- Instruction_name ["vqrdmulh"]],
-- By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32];
-- Vqdmulh_lane, [Saturating; Halving; Rounding;
-- Instruction_name ["vqrdmulh"]],
-- By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32];
--
-- (* Vector multiply by scalar. *)
-- Vmul_n, [InfoWord;
-- Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
-- Use_operands [| Dreg; Dreg; Corereg |], "vmul_n",
-- sign_invar_2, [S16; S32; U16; U32; F32];
-- Vmul_n, [InfoWord;
-- Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
-- Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n",
-- sign_invar_2, [S16; S32; U16; U32; F32];
--
-- (* Vector long multiply by scalar. *)
-- Vmull_n, [Instruction_name ["vmull"];
-- Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]],
-- Wide_scalar, "vmull_n",
-- elts_same_2, [S16; S32; U16; U32];
--
-- (* Vector saturating doubling long multiply by scalar. *)
-- Vqdmull_n, [Saturating; Doubling;
-- Disassembles_as [Use_operands [| Qreg; Dreg;
-- Element_of_dreg |]]],
-- Wide_scalar, "vqdmull_n",
-- elts_same_2, [S16; S32];
--
-- (* Vector saturating doubling long multiply high by scalar. *)
-- Vqdmulh_n,
-- [Saturating; Halving; InfoWord;
-- Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
-- Use_operands [| Qreg; Qreg; Corereg |],
-- "vqdmulhQ_n", elts_same_2, [S16; S32];
-- Vqdmulh_n,
-- [Saturating; Halving; InfoWord;
-- Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
-- Use_operands [| Dreg; Dreg; Corereg |],
-- "vqdmulh_n", elts_same_2, [S16; S32];
-- Vqdmulh_n,
-- [Saturating; Halving; Rounding; InfoWord;
-- Instruction_name ["vqrdmulh"];
-- Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
-- Use_operands [| Qreg; Qreg; Corereg |],
-- "vqRdmulhQ_n", elts_same_2, [S16; S32];
-- Vqdmulh_n,
-- [Saturating; Halving; Rounding; InfoWord;
-- Instruction_name ["vqrdmulh"];
-- Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
-- Use_operands [| Dreg; Dreg; Corereg |],
-- "vqRdmulh_n", elts_same_2, [S16; S32];
--
-- (* Vector multiply-accumulate by scalar. *)
-- Vmla_n, [InfoWord;
-- Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
-- Use_operands [| Dreg; Dreg; Corereg |], "vmla_n",
-- sign_invar_io, [S16; S32; U16; U32; F32];
-- Vmla_n, [InfoWord;
-- Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
-- Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n",
-- sign_invar_io, [S16; S32; U16; U32; F32];
-- Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32];
-- Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io,
-- [S16; S32];
--
-- (* Vector multiply subtract by scalar. *)
-- Vmls_n, [InfoWord;
-- Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
-- Use_operands [| Dreg; Dreg; Corereg |], "vmls_n",
-- sign_invar_io, [S16; S32; U16; U32; F32];
-- Vmls_n, [InfoWord;
-- Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
-- Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n",
-- sign_invar_io, [S16; S32; U16; U32; F32];
-- Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32];
-- Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io,
-- [S16; S32];
--
-- (* Vector extract. *)
-- Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
-- Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
-- [P64];
-- Vext, [Const_valuator (fun _ -> 0)],
-- Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
-- pf_su_8_64;
-- Vext, [Requires_feature "CRYPTO"; Const_valuator (fun _ -> 0)],
-- Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
-- [P64];
-- Vext, [Const_valuator (fun _ -> 0)],
-- Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
-- pf_su_8_64;
--
-- (* Reverse elements. *)
-- Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1,
-- P8 :: P16 :: F32 :: su_8_32;
-- Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1,
-- P8 :: P16 :: F32 :: su_8_32;
-- Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1,
-- [P8; P16; S8; U8; S16; U16];
-- Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1,
-- [P8; P16; S8; U8; S16; U16];
-- Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1,
-- [P8; S8; U8];
-- Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1,
-- [P8; S8; U8];
--
-- (* Bit selection. *)
-- Vbsl,
-- [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"];
-- Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
-- Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
-- [P64];
-- Vbsl,
-- [Instruction_name ["vbsl"; "vbit"; "vbif"];
-- Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
-- Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
-- pf_su_8_64;
-- Vbsl,
-- [Requires_feature "CRYPTO"; Instruction_name ["vbsl"; "vbit"; "vbif"];
-- Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
-- Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
-- [P64];
-- Vbsl,
-- [Instruction_name ["vbsl"; "vbit"; "vbif"];
-- Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
-- Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
-- pf_su_8_64;
--
-- Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2, pf_su_8_16;
-- Vtrn, [Use_shuffle trn_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vtrn", bits_2, suf_32;
-- Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
-- (* Zip elements. *)
-- Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2, pf_su_8_16;
-- Vzip, [Use_shuffle zip_elems; Instruction_name ["vuzp"]], Pair_result Dreg, "vzip", bits_2, suf_32;
-- Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
--
-- (* Unzip elements. *)
-- Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2,
-- pf_su_8_32;
-- Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2,
-- pf_su_8_32;
--
-- (* Element/structure loads. VLD1 variants. *)
-- Vldx 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
-- [P64];
-- Vldx 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
-- pf_su_8_64;
-- Vldx 1, [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
-- [P64];
-- Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
-- pf_su_8_64;
--
-- Vldx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
-- "vld1_lane", bits_3, pf_su_8_32;
-- Vldx_lane 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]];
-- Const_valuator (fun _ -> 0)],
-- Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
-- "vld1_lane", bits_3, [P64];
-- Vldx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]];
-- Const_valuator (fun _ -> 0)],
-- Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
-- "vld1_lane", bits_3, [S64; U64];
-- Vldx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
-- "vld1Q_lane", bits_3, pf_su_8_32;
-- Vldx_lane 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
-- "vld1Q_lane", bits_3, [P64];
-- Vldx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
-- "vld1Q_lane", bits_3, [S64; U64];
--
-- Vldx_dup 1,
-- [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
-- bits_1, pf_su_8_32;
-- Vldx_dup 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
-- bits_1, [P64];
-- Vldx_dup 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
-- bits_1, [S64; U64];
-- Vldx_dup 1,
-- [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
-- bits_1, pf_su_8_32;
-- (* Treated identically to vld1_dup above as we now
-- do a single load followed by a duplicate. *)
-- Vldx_dup 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
-- bits_1, [P64];
-- Vldx_dup 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
-- bits_1, [S64; U64];
--
-- (* VST1 variants. *)
-- Vstx 1, [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Dreg |], "vst1",
-- store_1, [P64];
-- Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Dreg |], "vst1",
-- store_1, pf_su_8_64;
-- Vstx 1, [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
-- store_1, [P64];
-- Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
-- store_1, pf_su_8_64;
--
-- Vstx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Dreg; Immed |],
-- "vst1_lane", store_3, pf_su_8_32;
-- Vstx_lane 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]];
-- Const_valuator (fun _ -> 0)],
-- Use_operands [| PtrTo Corereg; Dreg; Immed |],
-- "vst1_lane", store_3, [P64];
-- Vstx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]];
-- Const_valuator (fun _ -> 0)],
-- Use_operands [| PtrTo Corereg; Dreg; Immed |],
-- "vst1_lane", store_3, [U64; S64];
-- Vstx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Qreg; Immed |],
-- "vst1Q_lane", store_3, pf_su_8_32;
-- Vstx_lane 1,
-- [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Qreg; Immed |],
-- "vst1Q_lane", store_3, [P64];
-- Vstx_lane 1,
-- [Disassembles_as [Use_operands [| VecArray (1, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; Qreg; Immed |],
-- "vst1Q_lane", store_3, [U64; S64];
--
-- (* VLD2 variants. *)
-- Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
-- "vld2", bits_1, pf_su_8_32;
-- Vldx 2, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
-- Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
-- "vld2", bits_1, [P64];
-- Vldx 2, [Instruction_name ["vld1"]],
-- Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
-- "vld2", bits_1, [S64; U64];
-- Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- CstPtrTo Corereg |];
-- Use_operands [| VecArray (2, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |],
-- "vld2Q", bits_1, pf_su_8_32;
--
-- Vldx_lane 2,
-- [Disassembles_as [Use_operands
-- [| VecArray (2, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg;
-- VecArray (2, Dreg); Immed |],
-- "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
-- Vldx_lane 2,
-- [Disassembles_as [Use_operands
-- [| VecArray (2, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg;
-- VecArray (2, Qreg); Immed |],
-- "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
--
-- Vldx_dup 2,
-- [Disassembles_as [Use_operands
-- [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
-- "vld2_dup", bits_1, pf_su_8_32;
-- Vldx_dup 2,
-- [Requires_feature "CRYPTO";
-- Instruction_name ["vld1"]; Disassembles_as [Use_operands
-- [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
-- "vld2_dup", bits_1, [P64];
-- Vldx_dup 2,
-- [Instruction_name ["vld1"]; Disassembles_as [Use_operands
-- [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
-- "vld2_dup", bits_1, [S64; U64];
--
-- (* VST2 variants. *)
-- Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
-- store_1, pf_su_8_32;
-- Vstx 2, [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |]];
-- Instruction_name ["vst1"]],
-- Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
-- store_1, [P64];
-- Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |]];
-- Instruction_name ["vst1"]],
-- Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
-- store_1, [S64; U64];
-- Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |];
-- Use_operands [| VecArray (2, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q",
-- store_1, pf_su_8_32;
--
-- Vstx_lane 2,
-- [Disassembles_as [Use_operands
-- [| VecArray (2, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane",
-- store_3, P8 :: P16 :: F32 :: su_8_32;
-- Vstx_lane 2,
-- [Disassembles_as [Use_operands
-- [| VecArray (2, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane",
-- store_3, [P16; F32; U16; U32; S16; S32];
--
-- (* VLD3 variants. *)
-- Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
-- "vld3", bits_1, pf_su_8_32;
-- Vldx 3, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
-- Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
-- "vld3", bits_1, [P64];
-- Vldx 3, [Instruction_name ["vld1"]],
-- Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
-- "vld3", bits_1, [S64; U64];
-- Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
-- CstPtrTo Corereg |];
-- Use_operands [| VecArray (3, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |],
-- "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
--
-- Vldx_lane 3,
-- [Disassembles_as [Use_operands
-- [| VecArray (3, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg;
-- VecArray (3, Dreg); Immed |],
-- "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
-- Vldx_lane 3,
-- [Disassembles_as [Use_operands
-- [| VecArray (3, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg;
-- VecArray (3, Qreg); Immed |],
-- "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
--
-- Vldx_dup 3,
-- [Disassembles_as [Use_operands
-- [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
-- "vld3_dup", bits_1, pf_su_8_32;
-- Vldx_dup 3,
-- [Requires_feature "CRYPTO";
-- Instruction_name ["vld1"]; Disassembles_as [Use_operands
-- [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
-- "vld3_dup", bits_1, [P64];
-- Vldx_dup 3,
-- [Instruction_name ["vld1"]; Disassembles_as [Use_operands
-- [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
-- "vld3_dup", bits_1, [S64; U64];
--
-- (* VST3 variants. *)
-- Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
-- store_1, pf_su_8_32;
-- Vstx 3, [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]];
-- Instruction_name ["vst1"]],
-- Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
-- store_1, [P64];
-- Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]];
-- Instruction_name ["vst1"]],
-- Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
-- store_1, [S64; U64];
-- Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
-- PtrTo Corereg |];
-- Use_operands [| VecArray (3, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q",
-- store_1, pf_su_8_32;
--
-- Vstx_lane 3,
-- [Disassembles_as [Use_operands
-- [| VecArray (3, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane",
-- store_3, P8 :: P16 :: F32 :: su_8_32;
-- Vstx_lane 3,
-- [Disassembles_as [Use_operands
-- [| VecArray (3, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane",
-- store_3, [P16; F32; U16; U32; S16; S32];
--
-- (* VLD4/VST4 variants. *)
-- Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
-- "vld4", bits_1, pf_su_8_32;
-- Vldx 4, [Requires_feature "CRYPTO"; Instruction_name ["vld1"]],
-- Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
-- "vld4", bits_1, [P64];
-- Vldx 4, [Instruction_name ["vld1"]],
-- Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
-- "vld4", bits_1, [S64; U64];
-- Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- CstPtrTo Corereg |];
-- Use_operands [| VecArray (4, Dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |],
-- "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
--
-- Vldx_lane 4,
-- [Disassembles_as [Use_operands
-- [| VecArray (4, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg;
-- VecArray (4, Dreg); Immed |],
-- "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
-- Vldx_lane 4,
-- [Disassembles_as [Use_operands
-- [| VecArray (4, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg;
-- VecArray (4, Qreg); Immed |],
-- "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
--
-- Vldx_dup 4,
-- [Disassembles_as [Use_operands
-- [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
-- "vld4_dup", bits_1, pf_su_8_32;
-- Vldx_dup 4,
-- [Requires_feature "CRYPTO";
-- Instruction_name ["vld1"]; Disassembles_as [Use_operands
-- [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
-- "vld4_dup", bits_1, [P64];
-- Vldx_dup 4,
-- [Instruction_name ["vld1"]; Disassembles_as [Use_operands
-- [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
-- Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
-- "vld4_dup", bits_1, [S64; U64];
--
-- Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
-- store_1, pf_su_8_32;
-- Vstx 4, [Requires_feature "CRYPTO";
-- Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]];
-- Instruction_name ["vst1"]],
-- Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
-- store_1, [P64];
-- Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]];
-- Instruction_name ["vst1"]],
-- Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
-- store_1, [S64; U64];
-- Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |];
-- Use_operands [| VecArray (4, Dreg);
-- PtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q",
-- store_1, pf_su_8_32;
--
-- Vstx_lane 4,
-- [Disassembles_as [Use_operands
-- [| VecArray (4, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane",
-- store_3, P8 :: P16 :: F32 :: su_8_32;
-- Vstx_lane 4,
-- [Disassembles_as [Use_operands
-- [| VecArray (4, Element_of_dreg);
-- CstPtrTo Corereg |]]],
-- Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane",
-- store_3, [P16; F32; U16; U32; S16; S32];
--
-- (* Logical operations. And. *)
-- Vand, [], All (3, Dreg), "vand", notype_2, su_8_32;
-- Vand, [No_op], All (3, Dreg), "vand", notype_2, [S64; U64];
-- Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64;
--
-- (* Or. *)
-- Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_32;
-- Vorr, [No_op], All (3, Dreg), "vorr", notype_2, [S64; U64];
-- Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64;
--
-- (* Eor. *)
-- Veor, [], All (3, Dreg), "veor", notype_2, su_8_32;
-- Veor, [No_op], All (3, Dreg), "veor", notype_2, [S64; U64];
-- Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64;
--
-- (* Bic (And-not). *)
-- Vbic, [Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2, su_8_32;
-- Vbic, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vbic", notype_2, [S64; U64];
-- Vbic, [Compiler_optim "-O2"], All (3, Qreg), "vbicQ", notype_2, su_8_64;
--
-- (* Or-not. *)
-- Vorn, [Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2, su_8_32;
-- Vorn, [No_op; Compiler_optim "-O2"], All (3, Dreg), "vorn", notype_2, [S64; U64];
-- Vorn, [Compiler_optim "-O2"], All (3, Qreg), "vornQ", notype_2, su_8_64;
-- ]
--
--let type_in_crypto_only t
-- = (t == P64) || (t == P128)
--
--let cross_product s1 s2
-- = List.filter (fun (e, e') -> e <> e')
-- (List.concat (List.map (fun e1 -> List.map (fun e2 -> (e1,e2)) s1) s2))
--
--let reinterp =
-- let elems = P8 :: P16 :: F32 :: P64 :: su_8_64 in
-- let casts = cross_product elems elems in
-- List.map
-- (fun (convto, convfrom) ->
-- Vreinterp, (if (type_in_crypto_only convto) || (type_in_crypto_only convfrom)
-- then [Requires_feature "CRYPTO"] else []) @ [No_op], Use_operands [| Dreg; Dreg |],
-- "vreinterpret", conv_1, [Cast (convto, convfrom)])
-- casts
--
--let reinterpq =
-- let elems = P8 :: P16 :: F32 :: P64 :: P128 :: su_8_64 in
-- let casts = cross_product elems elems in
-- List.map
-- (fun (convto, convfrom) ->
-- Vreinterp, (if (type_in_crypto_only convto) || (type_in_crypto_only convfrom)
-- then [Requires_feature "CRYPTO"] else []) @ [No_op], Use_operands [| Qreg; Qreg |],
-- "vreinterpretQ", conv_1, [Cast (convto, convfrom)])
-- casts
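
For reference, a minimal C sketch (not part of the patch) of one intrinsic
this table generates; the No_op marking above means the reinterpret is a
pure bit-level cast and emits no instruction:

    /* Hypothetical standalone use of a generated reinterpret intrinsic;
       assumes arm_neon.h on an ARM target.  */
    #include <arm_neon.h>

    uint8x8_t
    as_bytes (float32x2_t v)
    {
      /* Same 64 bits, viewed as eight unsigned bytes.  */
      return vreinterpret_u8_f32 (v);
    }
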
--
--(* Output routines. *)
--
--let rec string_of_elt = function
-- S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64"
-- | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64"
-- | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64"
-- | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64"
-- | F16 -> "f16" | F32 -> "f32" | P8 -> "p8" | P16 -> "p16"
-- | P64 -> "p64" | P128 -> "p128"
-- | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b
-- | NoElts -> failwith "No elts"
--
--let string_of_elt_dots elt =
-- match elt with
-- Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b
-- | _ -> string_of_elt elt
--
--let string_of_vectype vt =
-- let rec name affix = function
-- T_int8x8 -> affix "int8x8"
-- | T_int8x16 -> affix "int8x16"
-- | T_int16x4 -> affix "int16x4"
-- | T_int16x8 -> affix "int16x8"
-- | T_int32x2 -> affix "int32x2"
-- | T_int32x4 -> affix "int32x4"
-- | T_int64x1 -> affix "int64x1"
-- | T_int64x2 -> affix "int64x2"
-- | T_uint8x8 -> affix "uint8x8"
-- | T_uint8x16 -> affix "uint8x16"
-- | T_uint16x4 -> affix "uint16x4"
-- | T_uint16x8 -> affix "uint16x8"
-- | T_uint32x2 -> affix "uint32x2"
-- | T_uint32x4 -> affix "uint32x4"
-- | T_uint64x1 -> affix "uint64x1"
-- | T_uint64x2 -> affix "uint64x2"
-- | T_float16x4 -> affix "float16x4"
-- | T_float32x2 -> affix "float32x2"
-- | T_float32x4 -> affix "float32x4"
-- | T_poly8x8 -> affix "poly8x8"
-- | T_poly8x16 -> affix "poly8x16"
-- | T_poly16x4 -> affix "poly16x4"
-- | T_poly16x8 -> affix "poly16x8"
-- | T_int8 -> affix "int8"
-- | T_int16 -> affix "int16"
-- | T_int32 -> affix "int32"
-- | T_int64 -> affix "int64"
-- | T_uint8 -> affix "uint8"
-- | T_uint16 -> affix "uint16"
-- | T_uint32 -> affix "uint32"
-- | T_uint64 -> affix "uint64"
-- | T_poly8 -> affix "poly8"
-- | T_poly16 -> affix "poly16"
-- | T_poly64 -> affix "poly64"
-- | T_poly64x1 -> affix "poly64x1"
-- | T_poly64x2 -> affix "poly64x2"
-- | T_poly128 -> affix "poly128"
-- | T_float16 -> affix "float16"
-- | T_float32 -> affix "float32"
-- | T_immediate _ -> "const int"
-- | T_void -> "void"
-- | T_intQI -> "__builtin_neon_qi"
-- | T_intHI -> "__builtin_neon_hi"
-- | T_intSI -> "__builtin_neon_si"
-- | T_intDI -> "__builtin_neon_di"
-- | T_intTI -> "__builtin_neon_ti"
-- | T_floatHF -> "__builtin_neon_hf"
-- | T_floatSF -> "__builtin_neon_sf"
-- | T_arrayof (num, base) ->
-- let basename = name (fun x -> x) base in
-- affix (Printf.sprintf "%sx%d" basename num)
-- | T_ptrto x ->
-- let basename = name affix x in
-- Printf.sprintf "%s *" basename
-- | T_const x ->
-- let basename = name affix x in
-- Printf.sprintf "const %s" basename
-- in
-- name (fun x -> x ^ "_t") vt
--
--let string_of_inttype = function
-- B_TImode -> "__builtin_neon_ti"
-- | B_EImode -> "__builtin_neon_ei"
-- | B_OImode -> "__builtin_neon_oi"
-- | B_CImode -> "__builtin_neon_ci"
-- | B_XImode -> "__builtin_neon_xi"
--
--let string_of_mode = function
-- V8QI -> "v8qi" | V4HI -> "v4hi" | V4HF -> "v4hf" | V2SI -> "v2si"
-- | V2SF -> "v2sf" | DI -> "di" | V16QI -> "v16qi" | V8HI -> "v8hi"
-- | V4SI -> "v4si" | V4SF -> "v4sf" | V2DI -> "v2di" | QI -> "qi"
-- | HI -> "hi" | SI -> "si" | SF -> "sf" | TI -> "ti"
--
--(* Use uppercase chars for letters which form part of the intrinsic name but
-- should be omitted from the builtin name (the info is passed in an extra
-- argument instead). *)
--let intrinsic_name name = String.lowercase name
--
--(* Allow the name of the builtin to be overridden by things (e.g. Flipped)
-- found in the features list. *)
--let builtin_name features name =
-- let name = List.fold_right
-- (fun el name ->
-- match el with
-- Flipped x | Builtin_name x -> x
-- | _ -> name)
-- features name in
-- let islower x = let str = String.make 1 x in (String.lowercase str) = str
-- and buf = Buffer.create (String.length name) in
-- String.iter (fun c -> if islower c then Buffer.add_char buf c) name;
-- Buffer.contents buf
--
--(* Transform an arity into a list of strings. *)
--let strings_of_arity a =
-- match a with
-- | Arity0 vt -> [string_of_vectype vt]
-- | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2]
-- | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1;
-- string_of_vectype vt2;
-- string_of_vectype vt3]
-- | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1;
-- string_of_vectype vt2;
-- string_of_vectype vt3;
-- string_of_vectype vt4]
-- | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1;
-- string_of_vectype vt2;
-- string_of_vectype vt3;
-- string_of_vectype vt4;
-- string_of_vectype vt5]
--
--(* Suffixes on the end of builtin names that are to be stripped in order
-- to obtain the name used as an instruction. They are only stripped if
-- preceded immediately by an underscore. *)
--let suffixes_to_strip = [ "n"; "lane"; "dup" ]
--
--(* Get the possible names of an instruction corresponding to a "name" from
-- the ops table. This is done by taking the equivalent builtin name and
-- stripping any suffixes from the suffixes_to_strip list above, unless the
-- features list contains an Instruction_name entry, in which case those
-- names are used, or a Flipped entry, in which case that name is used.
-- If both such entries are present, the first in the list will be
-- chosen. *)
--let get_insn_names features name =
-- let names = try
-- begin
-- match List.find (fun feature -> match feature with
-- Instruction_name _ -> true
-- | Flipped _ -> true
-- | _ -> false) features
-- with
-- Instruction_name names -> names
-- | Flipped name -> [name]
-- | _ -> assert false
-- end
-- with Not_found -> [builtin_name features name]
-- in
-- begin
-- List.map (fun name' ->
-- try
-- let underscore = String.rindex name' '_' in
-- let our_suffix = String.sub name' (underscore + 1)
-- ((String.length name') - underscore - 1)
-- in
-- let rec strip remaining_suffixes =
-- match remaining_suffixes with
-- [] -> name'
-- | s::ss when our_suffix = s -> String.sub name' 0 underscore
-- | _::ss -> strip ss
-- in
-- strip suffixes_to_strip
-- with (Not_found | Invalid_argument _) -> name') names
-- end
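
As an illustration only, the stripping rule implemented above (drop a
trailing _n, _lane or _dup, but only when it immediately follows an
underscore) rendered as a small C helper; strip_insn_suffix is a
hypothetical name, not code from the patch:

    #include <string.h>

    /* "vmla_n" -> "vmla", "vld1_lane" -> "vld1", "vabs" is unchanged.  */
    static void
    strip_insn_suffix (char *name)
    {
      static const char *const suffixes[] = { "n", "lane", "dup" };
      char *us = strrchr (name, '_');
      if (!us)
        return;
      for (size_t i = 0; i < sizeof suffixes / sizeof suffixes[0]; i++)
        if (strcmp (us + 1, suffixes[i]) == 0)
          {
            *us = '\0';   /* Cut the name at the underscore.  */
            return;
          }
    }
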
--
--(* Apply a function to each element of a list and then comma-separate
-- the resulting strings. *)
--let rec commas f elts acc =
-- match elts with
-- [] -> acc
-- | [elt] -> acc ^ (f elt)
-- | elt::elts ->
-- commas f elts (acc ^ (f elt) ^ ", ")
--
--(* Given a list of features and the shape specified in the "ops" table, apply
-- a function to each possible shape that the instruction may have.
-- By default, this is the "shape" entry in "ops". If the features list
-- contains a Disassembles_as entry, the shapes contained in that entry are
-- mapped to corresponding outputs and returned in a list. If there is more
-- than one Disassembles_as entry, only the first is used. *)
--let analyze_all_shapes features shape f =
-- try
-- match List.find (fun feature ->
-- match feature with Disassembles_as _ -> true
-- | _ -> false)
-- features with
-- Disassembles_as shapes -> List.map f shapes
-- | _ -> assert false
-- with Not_found -> [f shape]
--
--(* The crypto intrinsics have unconventional shapes and are not numerous
-- enough to be worth the trouble of encoding in the ops table, so we
-- implement them explicitly here. *)
--let crypto_intrinsics =
--"
--#ifdef __ARM_FEATURE_CRYPTO
--
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
--vldrq_p128 (poly128_t const * __ptr)
--{
--#ifdef __ARM_BIG_ENDIAN
-- poly64_t* __ptmp = (poly64_t*) __ptr;
-- poly64_t __d0 = vld1_p64 (__ptmp);
-- poly64_t __d1 = vld1_p64 (__ptmp + 1);
-- return vreinterpretq_p128_p64 (vcombine_p64 (__d1, __d0));
--#else
-- return vreinterpretq_p128_p64 (vld1q_p64 ((poly64_t*) __ptr));
--#endif
--}
--
--__extension__ static __inline void __attribute__ ((__always_inline__))
--vstrq_p128 (poly128_t * __ptr, poly128_t __val)
--{
--#ifdef __ARM_BIG_ENDIAN
-- poly64x2_t __tmp = vreinterpretq_p64_p128 (__val);
-- poly64_t __d0 = vget_high_p64 (__tmp);
-- poly64_t __d1 = vget_low_p64 (__tmp);
-- vst1q_p64 ((poly64_t*) __ptr, vcombine_p64 (__d0, __d1));
--#else
-- vst1q_p64 ((poly64_t*) __ptr, vreinterpretq_p64_p128 (__val));
--#endif
--}
--
--/* The vceq_p64 intrinsic does not map to a single instruction.
-- Instead we emulate it by performing a 32-bit variant of the vceq
-- and applying a pairwise min reduction to the result.
-- vceq_u32 will produce two 32-bit halves, each of which will contain either
-- all ones or all zeros depending on whether the corresponding 32-bit
-- halves of the poly64_t were equal. The two poly64_t values are equal
-- if and only if both halves are equal, i.e. vceq_u32 returns all ones.
-- If the result is all zeros for either half then the whole result is all zeros.
-- This is what the pairwise min reduction achieves. */
--
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vceq_p64 (poly64x1_t __a, poly64x1_t __b)
--{
-- uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
-- uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
-- uint32x2_t __c = vceq_u32 (__t_a, __t_b);
-- uint32x2_t __m = vpmin_u32 (__c, __c);
-- return vreinterpret_u64_u32 (__m);
--}
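
A short usage sketch (illustration only, assuming arm_neon.h and a
crypto-capable target) of the semantics the emulation above provides:

    #include <arm_neon.h>

    /* Hypothetical caller: nonzero when every bit of the two poly64
       values matched, zero otherwise.  */
    int
    p64_equal (poly64x1_t a, poly64x1_t b)
    {
      return vget_lane_u64 (vceq_p64 (a, b), 0) != 0;
    }
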
--
--/* The vtst_p64 intrinsic does not map to a single instruction.
-- We emulate it in a way similar to vceq_p64 above, but here we do
-- a reduction with max, since if any two corresponding bits
-- in the two poly64_t's match, the whole result must be all ones. */
--
--__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
--vtst_p64 (poly64x1_t __a, poly64x1_t __b)
--{
-- uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
-- uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
-- uint32x2_t __c = vtst_u32 (__t_a, __t_b);
-- uint32x2_t __m = vpmax_u32 (__c, __c);
-- return vreinterpret_u64_u32 (__m);
--}
--
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
--{
-- return __builtin_arm_crypto_aese (__data, __key);
--}
--
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaesdq_u8 (uint8x16_t __data, uint8x16_t __key)
--{
-- return __builtin_arm_crypto_aesd (__data, __key);
--}
--
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaesmcq_u8 (uint8x16_t __data)
--{
-- return __builtin_arm_crypto_aesmc (__data);
--}
--
--__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
--vaesimcq_u8 (uint8x16_t __data)
--{
-- return __builtin_arm_crypto_aesimc (__data);
--}
--
--__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
--vsha1h_u32 (uint32_t __hash_e)
--{
-- uint32x4_t __t = vdupq_n_u32 (0);
-- __t = vsetq_lane_u32 (__hash_e, __t, 0);
-- __t = __builtin_arm_crypto_sha1h (__t);
-- return vgetq_lane_u32 (__t, 0);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
--{
-- uint32x4_t __t = vdupq_n_u32 (0);
-- __t = vsetq_lane_u32 (__hash_e, __t, 0);
-- return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
--{
-- uint32x4_t __t = vdupq_n_u32 (0);
-- __t = vsetq_lane_u32 (__hash_e, __t, 0);
-- return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
--{
-- uint32x4_t __t = vdupq_n_u32 (0);
-- __t = vsetq_lane_u32 (__hash_e, __t, 0);
-- return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11)
--{
-- return __builtin_arm_crypto_sha1su0 (__w0_3, __w4_7, __w8_11);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15)
--{
-- return __builtin_arm_crypto_sha1su1 (__tw0_3, __w12_15);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
--{
-- return __builtin_arm_crypto_sha256h (__hash_abcd, __hash_efgh, __wk);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256h2q_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk)
--{
-- return __builtin_arm_crypto_sha256h2 (__hash_abcd, __hash_efgh, __wk);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7)
--{
-- return __builtin_arm_crypto_sha256su0 (__w0_3, __w4_7);
--}
--
--__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
--vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15)
--{
-- return __builtin_arm_crypto_sha256su1 (__tw0_3, __w8_11, __w12_15);
--}
--
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
--vmull_p64 (poly64_t __a, poly64_t __b)
--{
-- return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __a, (uint64_t) __b);
--}
--
--__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
--vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
--{
-- poly64_t __t1 = vget_high_p64 (__a);
-- poly64_t __t2 = vget_high_p64 (__b);
--
-- return (poly128_t) __builtin_arm_crypto_vmullp64 ((uint64_t) __t1, (uint64_t) __t2);
--}
--
--#endif
--"
---- a/src/gcc/config/arm/predicates.md
-+++ b/src/gcc/config/arm/predicates.md
-@@ -141,8 +141,7 @@
- (match_test "const_ok_for_arm (~INTVAL (op))")))
-
- (define_predicate "const0_operand"
-- (and (match_code "const_int")
-- (match_test "INTVAL (op) == 0")))
-+ (match_test "op == CONST0_RTX (mode)"))
-
- ;; Something valid on the RHS of an ARM data-processing instruction
- (define_predicate "arm_rhs_operand"
-@@ -170,8 +169,7 @@
-
- (define_predicate "const_neon_scalar_shift_amount_operand"
- (and (match_code "const_int")
-- (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) <= GET_MODE_BITSIZE (mode)
-- && ((unsigned HOST_WIDE_INT) INTVAL (op)) > 0")))
-+ (match_test "IN_RANGE (UINTVAL (op), 1, GET_MODE_BITSIZE (mode))")))
-
- (define_predicate "ldrd_strd_offset_operand"
- (and (match_operand 0 "const_int_operand")
-@@ -243,11 +241,6 @@
- (and (match_code "const_double")
- (match_test "arm_const_double_rtx (op)"))))
-
--(define_predicate "arm_float_compare_operand"
-- (if_then_else (match_test "TARGET_VFP")
-- (match_operand 0 "vfp_compare_operand")
-- (match_operand 0 "s_register_operand")))
--
- ;; True for valid index operands.
- (define_predicate "index_operand"
- (ior (match_operand 0 "s_register_operand")
-@@ -285,19 +278,19 @@
- (match_test "power_of_two_operand (XEXP (op, 1), mode)"))
- (and (match_code "rotate")
- (match_test "CONST_INT_P (XEXP (op, 1))
-- && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32")))
-+ && (UINTVAL (XEXP (op, 1))) < 32")))
- (and (match_code "ashift,ashiftrt,lshiftrt,rotatert")
- (match_test "!CONST_INT_P (XEXP (op, 1))
-- || ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32")))
-+ || (UINTVAL (XEXP (op, 1))) < 32")))
- (match_test "mode == GET_MODE (op)")))
-
- (define_special_predicate "shift_nomul_operator"
- (and (ior (and (match_code "rotate")
- (match_test "CONST_INT_P (XEXP (op, 1))
-- && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32"))
-+ && (UINTVAL (XEXP (op, 1))) < 32"))
- (and (match_code "ashift,ashiftrt,lshiftrt,rotatert")
- (match_test "!CONST_INT_P (XEXP (op, 1))
-- || ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) < 32")))
-+ || (UINTVAL (XEXP (op, 1))) < 32")))
- (match_test "mode == GET_MODE (op)")))
-
- ;; True for shift operators which can be used with saturation instructions.
-@@ -306,7 +299,7 @@
- (match_test "power_of_two_operand (XEXP (op, 1), mode)"))
- (and (match_code "ashift,ashiftrt")
- (match_test "CONST_INT_P (XEXP (op, 1))
-- && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1)) < 32)")))
-+ && (UINTVAL (XEXP (op, 1)) < 32)")))
- (match_test "mode == GET_MODE (op)")))
-
- ;; True for MULT, to identify which variant of shift_operator is in use.
-@@ -398,6 +391,12 @@
- || mode == CC_DGTUmode));
- })
-
-+;; Any register, including CC
-+(define_predicate "cc_register_operand"
-+ (and (match_code "reg")
-+ (ior (match_operand 0 "s_register_operand")
-+ (match_operand 0 "cc_register"))))
-+
- (define_special_predicate "arm_extendqisi_mem_op"
- (and (match_operand 0 "memory_operand")
- (match_test "TARGET_ARM ? arm_legitimate_address_outer_p (mode,
-@@ -532,7 +531,7 @@
- (ior (and (match_code "reg,subreg")
- (match_operand 0 "s_register_operand"))
- (and (match_code "const_int")
-- (match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 256"))))
-+ (match_test "(UINTVAL (op)) < 256"))))
-
- (define_predicate "thumb1_cmpneg_operand"
- (and (match_code "const_int")
-@@ -612,69 +611,23 @@
- (define_special_predicate "vect_par_constant_high"
- (match_code "parallel")
- {
-- HOST_WIDE_INT count = XVECLEN (op, 0);
-- int i;
-- int base = GET_MODE_NUNITS (mode);
--
-- if ((count < 1)
-- || (count != base/2))
-- return false;
--
-- if (!VECTOR_MODE_P (mode))
-- return false;
--
-- for (i = 0; i < count; i++)
-- {
-- rtx elt = XVECEXP (op, 0, i);
-- int val;
--
-- if (!CONST_INT_P (elt))
-- return false;
--
-- val = INTVAL (elt);
-- if (val != (base/2) + i)
-- return false;
-- }
-- return true;
-+ return arm_simd_check_vect_par_cnst_half_p (op, mode, true);
- })
-
- (define_special_predicate "vect_par_constant_low"
- (match_code "parallel")
- {
-- HOST_WIDE_INT count = XVECLEN (op, 0);
-- int i;
-- int base = GET_MODE_NUNITS (mode);
--
-- if ((count < 1)
-- || (count != base/2))
-- return false;
--
-- if (!VECTOR_MODE_P (mode))
-- return false;
--
-- for (i = 0; i < count; i++)
-- {
-- rtx elt = XVECEXP (op, 0, i);
-- int val;
--
-- if (!CONST_INT_P (elt))
-- return false;
--
-- val = INTVAL (elt);
-- if (val != i)
-- return false;
-- }
-- return true;
-+ return arm_simd_check_vect_par_cnst_half_p (op, mode, false);
- })
-
- (define_predicate "const_double_vcvt_power_of_two_reciprocal"
- (and (match_code "const_double")
-- (match_test "TARGET_32BIT && TARGET_VFP
-- && vfp3_const_double_for_fract_bits (op)")))
-+ (match_test "TARGET_32BIT
-+ && vfp3_const_double_for_fract_bits (op)")))
-
- (define_predicate "const_double_vcvt_power_of_two"
- (and (match_code "const_double")
-- (match_test "TARGET_32BIT && TARGET_VFP
-+ (match_test "TARGET_32BIT
- && vfp3_const_double_for_bits (op) > 0")))
-
- (define_predicate "neon_struct_operand"
---- a/src/gcc/config/arm/sync.md
-+++ b/src/gcc/config/arm/sync.md
-@@ -63,37 +63,59 @@
- (set_attr "predicable" "no")])
-
- (define_insn "atomic_load<mode>"
-- [(set (match_operand:QHSI 0 "register_operand" "=r")
-+ [(set (match_operand:QHSI 0 "register_operand" "=r,r,l")
- (unspec_volatile:QHSI
-- [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q")
-- (match_operand:SI 2 "const_int_operand")] ;; model
-+ [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q")
-+ (match_operand:SI 2 "const_int_operand" "n,Pf,n")] ;; model
- VUNSPEC_LDA))]
- "TARGET_HAVE_LDACQ"
- {
- enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
- if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
-- return \"ldr<sync_sfx>%?\\t%0, %1\";
-+ {
-+ if (TARGET_THUMB1)
-+ return \"ldr<sync_sfx>\\t%0, %1\";
-+ else
-+ return \"ldr<sync_sfx>%?\\t%0, %1\";
-+ }
- else
-- return \"lda<sync_sfx>%?\\t%0, %1\";
-+ {
-+ if (TARGET_THUMB1)
-+ return \"lda<sync_sfx>\\t%0, %1\";
-+ else
-+ return \"lda<sync_sfx>%?\\t%0, %1\";
-+ }
- }
-- [(set_attr "predicable" "yes")
-+ [(set_attr "arch" "32,v8mb,any")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- (define_insn "atomic_store<mode>"
-- [(set (match_operand:QHSI 0 "memory_operand" "=Q")
-+ [(set (match_operand:QHSI 0 "memory_operand" "=Q,Q,Q")
- (unspec_volatile:QHSI
-- [(match_operand:QHSI 1 "general_operand" "r")
-- (match_operand:SI 2 "const_int_operand")] ;; model
-+ [(match_operand:QHSI 1 "general_operand" "r,r,l")
-+ (match_operand:SI 2 "const_int_operand" "n,Pf,n")] ;; model
- VUNSPEC_STL))]
- "TARGET_HAVE_LDACQ"
- {
- enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
- if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
-- return \"str<sync_sfx>%?\t%1, %0\";
-+ {
-+ if (TARGET_THUMB1)
-+ return \"str<sync_sfx>\t%1, %0\";
-+ else
-+ return \"str<sync_sfx>%?\t%1, %0\";
-+ }
- else
-- return \"stl<sync_sfx>%?\t%1, %0\";
-+ {
-+ if (TARGET_THUMB1)
-+ return \"stl<sync_sfx>\t%1, %0\";
-+ else
-+ return \"stl<sync_sfx>%?\t%1, %0\";
-+ }
- }
-- [(set_attr "predicable" "yes")
-+ [(set_attr "arch" "32,v8mb,any")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- ;; An LDRD instruction usable by the atomic_loaddi expander on LPAE targets
-@@ -117,7 +139,7 @@
- [(match_operand:DI 0 "s_register_operand") ;; val out
- (match_operand:DI 1 "mem_noofs_operand") ;; memory
- (match_operand:SI 2 "const_int_operand")] ;; model
-- "(TARGET_HAVE_LDREXD || TARGET_HAVE_LPAE || TARGET_HAVE_LDACQ)
-+ "(TARGET_HAVE_LDREXD || TARGET_HAVE_LPAE || TARGET_HAVE_LDACQEXD)
- && ARM_DOUBLEWORD_ALIGN"
- {
- memmodel model = memmodel_from_int (INTVAL (operands[2]));
-@@ -125,7 +147,7 @@
- /* For ARMv8-A we can use an LDAEXD to atomically load two 32-bit registers
- when acquire or stronger semantics are needed. When the relaxed model is
- used this can be relaxed to a normal LDRD. */
-- if (TARGET_HAVE_LDACQ)
-+ if (TARGET_HAVE_LDACQEXD)
- {
- if (is_mm_relaxed (model))
- emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1]));
-@@ -167,21 +189,23 @@
- DONE;
- })
-
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; cbranchsi operations in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_compare_and_swap<mode>_1"
-- [(set (reg:CC_Z CC_REGNUM) ;; bool out
-+ [(set (match_operand 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out
- (unspec_volatile:CC_Z [(const_int 0)] VUNSPEC_ATOMIC_CAS))
-- (set (match_operand:SI 0 "s_register_operand" "=&r") ;; val out
-+ (set (match_operand:SI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out
- (zero_extend:SI
-- (match_operand:NARROW 1 "mem_noofs_operand" "+Ua"))) ;; memory
-- (set (match_dup 1)
-+ (match_operand:NARROW 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua"))) ;; memory
-+ (set (match_dup 2)
- (unspec_volatile:NARROW
-- [(match_operand:SI 2 "arm_add_operand" "rIL") ;; expected
-- (match_operand:NARROW 3 "s_register_operand" "r") ;; desired
-- (match_operand:SI 4 "const_int_operand") ;; is_weak
-- (match_operand:SI 5 "const_int_operand") ;; mod_s
-- (match_operand:SI 6 "const_int_operand")] ;; mod_f
-+ [(match_operand:SI 3 "arm_add_operand" "rIL,lIL*h,J,*r") ;; expected
-+ (match_operand:NARROW 4 "s_register_operand" "r,r,r,r") ;; desired
-+ (match_operand:SI 5 "const_int_operand") ;; is_weak
-+ (match_operand:SI 6 "const_int_operand") ;; mod_s
-+ (match_operand:SI 7 "const_int_operand")] ;; mod_f
- VUNSPEC_ATOMIC_CAS))
-- (clobber (match_scratch:SI 7 "=&r"))]
-+ (clobber (match_scratch:SI 8 "=&r,X,X,X"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -189,27 +213,30 @@
- {
- arm_split_compare_and_swap (operands);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb,v8mb,v8mb")])
-
- (define_mode_attr cas_cmp_operand
- [(SI "arm_add_operand") (DI "cmpdi_operand")])
- (define_mode_attr cas_cmp_str
- [(SI "rIL") (DI "rDi")])
-
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; cbranchsi operations in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_compare_and_swap<mode>_1"
-- [(set (reg:CC_Z CC_REGNUM) ;; bool out
-+ [(set (match_operand 0 "cc_register_operand" "=&c,&l,&l,&l") ;; bool out
- (unspec_volatile:CC_Z [(const_int 0)] VUNSPEC_ATOMIC_CAS))
-- (set (match_operand:SIDI 0 "s_register_operand" "=&r") ;; val out
-- (match_operand:SIDI 1 "mem_noofs_operand" "+Ua")) ;; memory
-- (set (match_dup 1)
-+ (set (match_operand:SIDI 1 "s_register_operand" "=&r,&l,&0,&l*h") ;; val out
-+ (match_operand:SIDI 2 "mem_noofs_operand" "+Ua,Ua,Ua,Ua")) ;; memory
-+ (set (match_dup 2)
- (unspec_volatile:SIDI
-- [(match_operand:SIDI 2 "<cas_cmp_operand>" "<cas_cmp_str>") ;; expect
-- (match_operand:SIDI 3 "s_register_operand" "r") ;; desired
-- (match_operand:SI 4 "const_int_operand") ;; is_weak
-- (match_operand:SI 5 "const_int_operand") ;; mod_s
-- (match_operand:SI 6 "const_int_operand")] ;; mod_f
-+ [(match_operand:SIDI 3 "<cas_cmp_operand>" "<cas_cmp_str>,lIL*h,J,*r") ;; expect
-+ (match_operand:SIDI 4 "s_register_operand" "r,r,r,r") ;; desired
-+ (match_operand:SI 5 "const_int_operand") ;; is_weak
-+ (match_operand:SI 6 "const_int_operand") ;; mod_s
-+ (match_operand:SI 7 "const_int_operand")] ;; mod_f
- VUNSPEC_ATOMIC_CAS))
-- (clobber (match_scratch:SI 7 "=&r"))]
-+ (clobber (match_scratch:SI 8 "=&r,X,X,X"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -217,18 +244,19 @@
- {
- arm_split_compare_and_swap (operands);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb,v8mb,v8mb")])
-
- (define_insn_and_split "atomic_exchange<mode>"
-- [(set (match_operand:QHSD 0 "s_register_operand" "=&r") ;; output
-- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")) ;; memory
-+ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r") ;; output
-+ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua")) ;; memory
- (set (match_dup 1)
- (unspec_volatile:QHSD
-- [(match_operand:QHSD 2 "s_register_operand" "r") ;; input
-+ [(match_operand:QHSD 2 "s_register_operand" "r,r") ;; input
- (match_operand:SI 3 "const_int_operand" "")] ;; model
- VUNSPEC_ATOMIC_XCHG))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:SI 4 "=&r"))]
-+ (clobber (match_scratch:SI 4 "=&r,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -237,7 +265,11 @@
- arm_split_atomic_op (SET, operands[0], NULL, operands[1],
- operands[2], operands[3], operands[4]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb")])
-+
-+;; The following mode and code attributes are defined here because they are
-+;; specific to atomics and are not needed anywhere else.
-
- (define_mode_attr atomic_op_operand
- [(QI "reg_or_int_operand")
-@@ -248,16 +280,24 @@
- (define_mode_attr atomic_op_str
- [(QI "rn") (HI "rn") (SI "rn") (DI "r")])
-
-+(define_code_attr thumb1_atomic_op_str
-+ [(ior "l,l") (xor "l,l") (and "l,l") (plus "lIJL,r") (minus "lPd,lPd")])
-+
-+(define_code_attr thumb1_atomic_newop_str
-+ [(ior "&l,&l") (xor "&l,&l") (and "&l,&l") (plus "&l,&r") (minus "&l,&l")])
-+
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; non-atomic operations in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_<sync_optab><mode>"
-- [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua")
-+ [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua,Ua,Ua")
- (unspec_volatile:QHSD
- [(syncop:QHSD (match_dup 0)
-- (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>"))
-+ (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>,<thumb1_atomic_op_str>"))
- (match_operand:SI 2 "const_int_operand")] ;; model
- VUNSPEC_ATOMIC_OP))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:QHSD 3 "=&r"))
-- (clobber (match_scratch:SI 4 "=&r"))]
-+ (clobber (match_scratch:QHSD 3 "=&r,<thumb1_atomic_newop_str>"))
-+ (clobber (match_scratch:SI 4 "=&r,&l,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -266,19 +306,22 @@
- arm_split_atomic_op (<CODE>, NULL, operands[3], operands[0],
- operands[1], operands[2], operands[4]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb,v8mb")])
-
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; non-atomic NANDs in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_nand<mode>"
-- [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua")
-+ [(set (match_operand:QHSD 0 "mem_noofs_operand" "+Ua,Ua")
- (unspec_volatile:QHSD
- [(not:QHSD
- (and:QHSD (match_dup 0)
-- (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>")))
-+ (match_operand:QHSD 1 "<atomic_op_operand>" "<atomic_op_str>,l")))
- (match_operand:SI 2 "const_int_operand")] ;; model
- VUNSPEC_ATOMIC_OP))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:QHSD 3 "=&r"))
-- (clobber (match_scratch:SI 4 "=&r"))]
-+ (clobber (match_scratch:QHSD 3 "=&r,&l"))
-+ (clobber (match_scratch:SI 4 "=&r,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -287,20 +330,38 @@
- arm_split_atomic_op (NOT, NULL, operands[3], operands[0],
- operands[1], operands[2], operands[4]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb")])
-+
-+;; Three alternatives are needed to represent the constraints after the
-+;; split from thumb1_addsi3: (i) the case where operand1 and the destination
-+;; can be in different registers, (ii) the case where they are in the same
-+;; low register and (iii) the case where they are in the same register with
-+;; no restriction on the register.  We slightly disparage alternatives that
-+;; require copying the old value into the register for the new value (see
-+;; bind_old_new in arm_split_atomic_op).
-+(define_code_attr thumb1_atomic_fetch_op_str
-+ [(ior "l,l,l") (xor "l,l,l") (and "l,l,l") (plus "lL,?IJ,?r") (minus "lPd,lPd,lPd")])
-+
-+(define_code_attr thumb1_atomic_fetch_newop_str
-+ [(ior "&l,&l,&l") (xor "&l,&l,&l") (and "&l,&l,&l") (plus "&l,&l,&r") (minus "&l,&l,&l")])
-
-+(define_code_attr thumb1_atomic_fetch_oldop_str
-+ [(ior "&r,&r,&r") (xor "&r,&r,&r") (and "&r,&r,&r") (plus "&l,&r,&r") (minus "&l,&l,&l")])
-+
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; non-atomic operations in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_fetch_<sync_optab><mode>"
-- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
-- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua"))
-+ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,<thumb1_atomic_fetch_oldop_str>")
-+ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua,Ua,Ua"))
- (set (match_dup 1)
- (unspec_volatile:QHSD
- [(syncop:QHSD (match_dup 1)
-- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>"))
-+ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,<thumb1_atomic_fetch_op_str>"))
- (match_operand:SI 3 "const_int_operand")] ;; model
- VUNSPEC_ATOMIC_OP))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:QHSD 4 "=&r"))
-- (clobber (match_scratch:SI 5 "=&r"))]
-+ (clobber (match_scratch:QHSD 4 "=&r,<thumb1_atomic_fetch_newop_str>"))
-+ (clobber (match_scratch:SI 5 "=&r,&l,&l,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -309,21 +370,24 @@
- arm_split_atomic_op (<CODE>, operands[0], operands[4], operands[1],
- operands[2], operands[3], operands[5]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb,v8mb,v8mb")])
-
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; non-atomic NANDs in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_fetch_nand<mode>"
-- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
-- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua"))
-+ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&r")
-+ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua"))
- (set (match_dup 1)
- (unspec_volatile:QHSD
- [(not:QHSD
- (and:QHSD (match_dup 1)
-- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>")))
-+ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,l")))
- (match_operand:SI 3 "const_int_operand")] ;; model
- VUNSPEC_ATOMIC_OP))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:QHSD 4 "=&r"))
-- (clobber (match_scratch:SI 5 "=&r"))]
-+ (clobber (match_scratch:QHSD 4 "=&r,&l"))
-+ (clobber (match_scratch:SI 5 "=&r,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -332,20 +396,23 @@
- arm_split_atomic_op (NOT, operands[0], operands[4], operands[1],
- operands[2], operands[3], operands[5]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb")])
-
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; non-atomic operations in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_<sync_optab>_fetch<mode>"
-- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
-+ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,<thumb1_atomic_newop_str>")
- (syncop:QHSD
-- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")
-- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>")))
-+ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua,Ua")
-+ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,<thumb1_atomic_op_str>")))
- (set (match_dup 1)
- (unspec_volatile:QHSD
- [(match_dup 1) (match_dup 2)
- (match_operand:SI 3 "const_int_operand")] ;; model
- VUNSPEC_ATOMIC_OP))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:SI 4 "=&r"))]
-+ (clobber (match_scratch:SI 4 "=&r,&l,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -354,21 +421,24 @@
- arm_split_atomic_op (<CODE>, NULL, operands[0], operands[1],
- operands[2], operands[3], operands[4]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb,v8mb")])
-
-+;; Constraints of this pattern must be at least as strict as those of the
-+;; non-atomic NANDs in thumb1.md, and as permissive as possible.
- (define_insn_and_split "atomic_nand_fetch<mode>"
-- [(set (match_operand:QHSD 0 "s_register_operand" "=&r")
-+ [(set (match_operand:QHSD 0 "s_register_operand" "=&r,&l")
- (not:QHSD
- (and:QHSD
-- (match_operand:QHSD 1 "mem_noofs_operand" "+Ua")
-- (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>"))))
-+ (match_operand:QHSD 1 "mem_noofs_operand" "+Ua,Ua")
-+ (match_operand:QHSD 2 "<atomic_op_operand>" "<atomic_op_str>,l"))))
- (set (match_dup 1)
- (unspec_volatile:QHSD
- [(match_dup 1) (match_dup 2)
- (match_operand:SI 3 "const_int_operand")] ;; model
- VUNSPEC_ATOMIC_OP))
- (clobber (reg:CC CC_REGNUM))
-- (clobber (match_scratch:SI 4 "=&r"))]
-+ (clobber (match_scratch:SI 4 "=&r,&l"))]
- "<sync_predtab>"
- "#"
- "&& reload_completed"
-@@ -377,48 +447,61 @@
- arm_split_atomic_op (NOT, NULL, operands[0], operands[1],
- operands[2], operands[3], operands[4]);
- DONE;
-- })
-+ }
-+ [(set_attr "arch" "32,v8mb")])
-
- (define_insn "arm_load_exclusive<mode>"
-- [(set (match_operand:SI 0 "s_register_operand" "=r")
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
- (zero_extend:SI
- (unspec_volatile:NARROW
-- [(match_operand:NARROW 1 "mem_noofs_operand" "Ua")]
-+ [(match_operand:NARROW 1 "mem_noofs_operand" "Ua,Ua")]
- VUNSPEC_LL)))]
- "TARGET_HAVE_LDREXBH"
-- "ldrex<sync_sfx>%?\t%0, %C1"
-- [(set_attr "predicable" "yes")
-+ "@
-+ ldrex<sync_sfx>%?\t%0, %C1
-+ ldrex<sync_sfx>\t%0, %C1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- (define_insn "arm_load_acquire_exclusive<mode>"
-- [(set (match_operand:SI 0 "s_register_operand" "=r")
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
- (zero_extend:SI
- (unspec_volatile:NARROW
-- [(match_operand:NARROW 1 "mem_noofs_operand" "Ua")]
-+ [(match_operand:NARROW 1 "mem_noofs_operand" "Ua,Ua")]
- VUNSPEC_LAX)))]
- "TARGET_HAVE_LDACQ"
-- "ldaex<sync_sfx>%?\\t%0, %C1"
-- [(set_attr "predicable" "yes")
-+ "@
-+ ldaex<sync_sfx>%?\\t%0, %C1
-+ ldaex<sync_sfx>\\t%0, %C1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- (define_insn "arm_load_exclusivesi"
-- [(set (match_operand:SI 0 "s_register_operand" "=r")
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
- (unspec_volatile:SI
-- [(match_operand:SI 1 "mem_noofs_operand" "Ua")]
-+ [(match_operand:SI 1 "mem_noofs_operand" "Ua,Ua")]
- VUNSPEC_LL))]
- "TARGET_HAVE_LDREX"
-- "ldrex%?\t%0, %C1"
-- [(set_attr "predicable" "yes")
-+ "@
-+ ldrex%?\t%0, %C1
-+ ldrex\t%0, %C1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- (define_insn "arm_load_acquire_exclusivesi"
-- [(set (match_operand:SI 0 "s_register_operand" "=r")
-+ [(set (match_operand:SI 0 "s_register_operand" "=r,r")
- (unspec_volatile:SI
-- [(match_operand:SI 1 "mem_noofs_operand" "Ua")]
-+ [(match_operand:SI 1 "mem_noofs_operand" "Ua,Ua")]
- VUNSPEC_LAX))]
- "TARGET_HAVE_LDACQ"
-- "ldaex%?\t%0, %C1"
-- [(set_attr "predicable" "yes")
-+ "@
-+ ldaex%?\t%0, %C1
-+ ldaex\t%0, %C1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- (define_insn "arm_load_exclusivedi"
-@@ -436,7 +519,7 @@
- (unspec_volatile:DI
- [(match_operand:DI 1 "mem_noofs_operand" "Ua")]
- VUNSPEC_LAX))]
-- "TARGET_HAVE_LDACQ && ARM_DOUBLEWORD_ALIGN"
-+ "TARGET_HAVE_LDACQEXD && ARM_DOUBLEWORD_ALIGN"
- "ldaexd%?\t%0, %H0, %C1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-@@ -452,16 +535,18 @@
- {
- if (<MODE>mode == DImode)
- {
-- rtx value = operands[2];
- /* The restrictions on target registers in ARM mode are that the two
- registers are consecutive and the first one is even; Thumb is
- actually more flexible, but DI should give us this anyway.
-- Note that the 1st register always gets the lowest word in memory. */
-- gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2);
-- operands[3] = gen_rtx_REG (SImode, REGNO (value) + 1);
-- return "strexd%?\t%0, %2, %3, %C1";
-+ Note that the 1st register always gets the
-+ lowest word in memory. */
-+ gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2);
-+ return "strexd%?\t%0, %2, %H2, %C1";
- }
-- return "strex<sync_sfx>%?\t%0, %2, %C1";
-+ if (TARGET_THUMB1)
-+ return "strex<sync_sfx>\t%0, %2, %C1";
-+ else
-+ return "strex<sync_sfx>%?\t%0, %2, %C1";
- }
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-@@ -473,25 +558,26 @@
- (unspec_volatile:DI
- [(match_operand:DI 2 "s_register_operand" "r")]
- VUNSPEC_SLX))]
-- "TARGET_HAVE_LDACQ && ARM_DOUBLEWORD_ALIGN"
-+ "TARGET_HAVE_LDACQEXD && ARM_DOUBLEWORD_ALIGN"
- {
-- rtx value = operands[2];
- /* See comment in arm_store_exclusive<mode> above. */
-- gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2);
-- operands[3] = gen_rtx_REG (SImode, REGNO (value) + 1);
-- return "stlexd%?\t%0, %2, %3, %C1";
-+ gcc_assert ((REGNO (operands[2]) & 1) == 0 || TARGET_THUMB2);
-+ return "stlexd%?\t%0, %2, %H2, %C1";
- }
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
-
- (define_insn "arm_store_release_exclusive<mode>"
-- [(set (match_operand:SI 0 "s_register_operand" "=&r")
-+ [(set (match_operand:SI 0 "s_register_operand" "=&r,&r")
- (unspec_volatile:SI [(const_int 0)] VUNSPEC_SLX))
-- (set (match_operand:QHSI 1 "mem_noofs_operand" "=Ua")
-+ (set (match_operand:QHSI 1 "mem_noofs_operand" "=Ua,Ua")
- (unspec_volatile:QHSI
-- [(match_operand:QHSI 2 "s_register_operand" "r")]
-+ [(match_operand:QHSI 2 "s_register_operand" "r,r")]
- VUNSPEC_SLX))]
- "TARGET_HAVE_LDACQ"
-- "stlex<sync_sfx>%?\t%0, %2, %C1"
-- [(set_attr "predicable" "yes")
-+ "@
-+ stlex<sync_sfx>%?\t%0, %2, %C1
-+ stlex<sync_sfx>\t%0, %2, %C1"
-+ [(set_attr "arch" "32,v8mb")
-+ (set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")])
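# Note: the sync.md changes above add low-register ("l") alternatives and
# "arch" attributes so the LL/SC patterns are also usable on ARMv8-M
# Baseline. A minimal C sketch, using only GCC's documented __atomic
# builtins, of the source-level operations these patterns implement
# (function names here are illustrative, not from the patch):
#
#   #include <stdint.h>
#
#   /* Expands through atomic_nand_fetch<mode>: an ldrex/strex retry loop.  */
#   uint32_t
#   nand_fetch (uint32_t *p, uint32_t v)
#   {
#     return __atomic_nand_fetch (p, v, __ATOMIC_SEQ_CST);
#   }
#
#   /* With acquire/release ordering the loop uses the ldaex/stlex forms
#      guarded by TARGET_HAVE_LDACQ above.  */
#   uint32_t
#   xchg_acq_rel (uint32_t *p, uint32_t v)
#   {
#     return __atomic_exchange_n (p, v, __ATOMIC_ACQ_REL);
#   }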
---- a/src/gcc/config/arm/t-aprofile
-+++ b/src/gcc/config/arm/t-aprofile
-@@ -49,38 +49,33 @@ MULTILIB_DIRNAMES += fpv3 simdv1 fpv4 simdvfpv4 simdv8
- MULTILIB_OPTIONS += mfloat-abi=softfp/mfloat-abi=hard
- MULTILIB_DIRNAMES += softfp hard
-
--# We don't build no-float libraries with an FPU.
--MULTILIB_EXCEPTIONS += *mfpu=vfpv3-d16
--MULTILIB_EXCEPTIONS += *mfpu=neon
--MULTILIB_EXCEPTIONS += *mfpu=vfpv4-d16
--MULTILIB_EXCEPTIONS += *mfpu=neon-vfpv4
--MULTILIB_EXCEPTIONS += *mfpu=neon-fp-armv8
--
--# We don't build libraries requiring an FPU at the CPU/Arch/ISA level.
--MULTILIB_EXCEPTIONS += mfloat-abi=*
--MULTILIB_EXCEPTIONS += mfpu=*
--MULTILIB_EXCEPTIONS += mthumb/mfloat-abi=*
--MULTILIB_EXCEPTIONS += mthumb/mfpu=*
--MULTILIB_EXCEPTIONS += *march=armv7-a/mfloat-abi=*
--MULTILIB_EXCEPTIONS += *march=armv7ve/mfloat-abi=*
--MULTILIB_EXCEPTIONS += *march=armv8-a/mfloat-abi=*
--
--# Ensure the correct FPU variants apply to the correct base architectures.
--MULTILIB_EXCEPTIONS += *march=armv7ve/*mfpu=vfpv3-d16*
--MULTILIB_EXCEPTIONS += *march=armv7ve/*mfpu=neon/*
--MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=vfpv3-d16*
--MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=neon/*
--MULTILIB_EXCEPTIONS += *march=armv7-a/*mfpu=vfpv4-d16*
--MULTILIB_EXCEPTIONS += *march=armv7-a/*mfpu=neon-vfpv4*
--MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=vfpv4-d16*
--MULTILIB_EXCEPTIONS += *march=armv8-a/*mfpu=neon-vfpv4*
--MULTILIB_EXCEPTIONS += *march=armv7-a/*mfpu=neon-fp-armv8*
--MULTILIB_EXCEPTIONS += *march=armv7ve/*mfpu=neon-fp-armv8*
-+
-+# Option combinations to build library with
-+
-+# Default CPU/Arch (ARM is implicitly included because it uses the default
-+# multilib)
-+MULTILIB_REQUIRED += mthumb
-+
-+# ARMv7-A
-+MULTILIB_REQUIRED += *march=armv7-a
-+MULTILIB_REQUIRED += *march=armv7-a/mfpu=vfpv3-d16/mfloat-abi=*
-+MULTILIB_REQUIRED += *march=armv7-a/mfpu=neon/mfloat-abi=*
-+
-+# ARMv7VE
-+MULTILIB_REQUIRED += *march=armv7ve
-+MULTILIB_REQUIRED += *march=armv7ve/mfpu=vfpv4-d16/mfloat-abi=*
-+MULTILIB_REQUIRED += *march=armv7ve/mfpu=neon-vfpv4/mfloat-abi=*
-+
-+# ARMv8-A
-+MULTILIB_REQUIRED += *march=armv8-a
-+MULTILIB_REQUIRED += *march=armv8-a/mfpu=neon-fp-armv8/mfloat-abi=*
-+
-
- # CPU Matches
- MULTILIB_MATCHES += march?armv7-a=mcpu?cortex-a8
- MULTILIB_MATCHES += march?armv7-a=mcpu?cortex-a9
- MULTILIB_MATCHES += march?armv7-a=mcpu?cortex-a5
-+MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a7
- MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a15
- MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a12
- MULTILIB_MATCHES += march?armv7ve=mcpu?cortex-a17
-@@ -93,6 +88,9 @@ MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a57
- MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a57.cortex-a53
- MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a72
- MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a72.cortex-a53
-+MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a73
-+MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a73.cortex-a35
-+MULTILIB_MATCHES += march?armv8-a=mcpu?cortex-a73.cortex-a53
- MULTILIB_MATCHES += march?armv8-a=mcpu?exynos-m1
- MULTILIB_MATCHES += march?armv8-a=mcpu?qdf24xx
- MULTILIB_MATCHES += march?armv8-a=mcpu?xgene1
-@@ -101,13 +99,20 @@ MULTILIB_MATCHES += march?armv8-a=mcpu?xgene1
- MULTILIB_MATCHES += march?armv8-a=march?armv8-a+crc
- MULTILIB_MATCHES += march?armv8-a=march?armv8.1-a
- MULTILIB_MATCHES += march?armv8-a=march?armv8.1-a+crc
-+MULTILIB_MATCHES += march?armv8-a=march?armv8.2-a
-+MULTILIB_MATCHES += march?armv8-a=march?armv8.2-a+fp16
-
- # FPU matches
- MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3
- MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16
--MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16-d16
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-d16-fp16
-+MULTILIB_MATCHES += mfpu?neon=mfpu?neon-fp16
- MULTILIB_MATCHES += mfpu?vfpv4-d16=mfpu?vfpv4
-+MULTILIB_MATCHES += mfpu?vfpv4-d16=mfpu?fpv5-d16
-+MULTILIB_MATCHES += mfpu?vfpv4-d16=mfpu?fp-armv8
- MULTILIB_MATCHES += mfpu?neon-fp-armv8=mfpu?crypto-neon-fp-armv8
-+MULTILIB_MATCHES += mfpu?vfp=mfpu?vfpv2
-+MULTILIB_MATCHES += mfpu?neon=mfpu?neon-vfpv3
-
-
- # Map all requests for vfpv3 with a later CPU to vfpv3-d16 v7-a.
-@@ -124,10 +129,6 @@ MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv8
- MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv8-a/mfpu.vfpv3-d16/mfloat-abi.softfp
- MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.hard
- MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.softfp
--MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7-a/mfpu.fp-armv8/mfloat-abi.hard
--MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7-a/mfpu.fp-armv8/mfloat-abi.softfp
--MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7-a/mfpu.vfpv4/mfloat-abi.hard
--MULTILIB_REUSE += march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7-a/mfpu.vfpv4/mfloat-abi.softfp
-
-
- MULTILIB_REUSE += march.armv7-a/mfpu.neon/mfloat-abi.hard=march.armv7ve/mfpu.neon/mfloat-abi.hard
-@@ -140,10 +141,6 @@ MULTILIB_REUSE += march.armv7-a/mfpu.neon/mfloat-abi.hard=march.armv7-a/mf
- MULTILIB_REUSE += march.armv7-a/mfpu.neon/mfloat-abi.softfp=march.armv7-a/mfpu.neon-fp-armv8/mfloat-abi.softfp
-
-
--MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=march.armv7ve/mfpu.fp-armv8/mfloat-abi.hard
--MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=march.armv7ve/mfpu.fp-armv8/mfloat-abi.softfp
--MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=march.armv8-a/mfpu.vfpv4/mfloat-abi.hard
--MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=march.armv8-a/mfpu.vfpv4/mfloat-abi.softfp
- MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.hard
- MULTILIB_REUSE += march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.softfp
-
-@@ -163,10 +160,6 @@ MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthu
- MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv8-a/mfpu.vfpv3-d16/mfloat-abi.softfp
- MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.hard
- MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.vfpv4-d16/mfloat-abi.softfp
--MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7-a/mfpu.fp-armv8/mfloat-abi.hard
--MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.fp-armv8/mfloat-abi.softfp
--MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7-a/mfpu.vfpv4/mfloat-abi.hard
--MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.vfpv4/mfloat-abi.softfp
-
-
- MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.neon/mfloat-abi.hard=mthumb/march.armv7ve/mfpu.neon/mfloat-abi.hard
-@@ -179,10 +172,6 @@ MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.neon/mfloat-abi.hard=mthumb/ma
- MULTILIB_REUSE += mthumb/march.armv7-a/mfpu.neon/mfloat-abi.softfp=mthumb/march.armv7-a/mfpu.neon-fp-armv8/mfloat-abi.softfp
-
-
--MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=mthumb/march.armv7ve/mfpu.fp-armv8/mfloat-abi.hard
--MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=mthumb/march.armv7ve/mfpu.fp-armv8/mfloat-abi.softfp
--MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=mthumb/march.armv8-a/mfpu.vfpv4/mfloat-abi.hard
--MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=mthumb/march.armv8-a/mfpu.vfpv4/mfloat-abi.softfp
- MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.hard=mthumb/march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.hard
- MULTILIB_REUSE += mthumb/march.armv7ve/mfpu.vfpv4-d16/mfloat-abi.softfp=mthumb/march.armv8-a/mfpu.vfpv4-d16/mfloat-abi.softfp
-
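# Usage note for the t-aprofile scheme above: the MULTILIB_MATCHES entries
# fold CPU and FPU names into the architecture-level library sets, so (for
# example) a -mcpu=cortex-a73 build is served by the armv8-a multilib, and
# -mfpu=crypto-neon-fp-armv8 is treated as neon-fp-armv8.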
---- a/src/gcc/config/arm/t-arm
-+++ b/src/gcc/config/arm/t-arm
-@@ -95,7 +95,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
- $(srcdir)/config/arm/arm-cores.def \
- $(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \
- $(srcdir)/config/arm/arm-protos.h \
-- $(srcdir)/config/arm/arm_neon_builtins.def
-+ $(srcdir)/config/arm/arm_neon_builtins.def \
-+ $(srcdir)/config/arm/arm_vfp_builtins.def
-
- arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
- $(SYSTEM_H) coretypes.h $(TM_H) \
-@@ -103,6 +104,7 @@ arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
- $(DIAGNOSTIC_CORE_H) $(OPTABS_H) \
- $(srcdir)/config/arm/arm-protos.h \
- $(srcdir)/config/arm/arm_neon_builtins.def \
-+ $(srcdir)/config/arm/arm_vfp_builtins.def \
- $(srcdir)/config/arm/arm-simd-builtin-types.def
- $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
- $(srcdir)/config/arm/arm-builtins.c
---- /dev/null
-+++ b/src/gcc/config/arm/t-rmprofile
-@@ -0,0 +1,176 @@
-+# Copyright (C) 2016 Free Software Foundation, Inc.
-+#
-+# This file is part of GCC.
-+#
-+# GCC is free software; you can redistribute it and/or modify
-+# it under the terms of the GNU General Public License as published by
-+# the Free Software Foundation; either version 3, or (at your option)
-+# any later version.
-+#
-+# GCC is distributed in the hope that it will be useful,
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+# GNU General Public License for more details.
-+#
-+# You should have received a copy of the GNU General Public License
-+# along with GCC; see the file COPYING3. If not see
-+# <http://www.gnu.org/licenses/>.
-+
-+# This is a target makefile fragment that attempts to get
-+# multilibs built for the range of CPUs, FPUs and ABIs that
-+# are relevant for the ARM architecture. It should not be used in
-+# conjunction with another makefile fragment and assumes --with-arch,
-+# --with-cpu, --with-fpu, --with-float, --with-mode have their default
-+# values during the configure step. We enforce this during the
-+# top-level configury.
-+
-+MULTILIB_OPTIONS =
-+MULTILIB_DIRNAMES =
-+MULTILIB_EXCEPTIONS =
-+MULTILIB_MATCHES =
-+MULTILIB_REUSE =
-+
-+# We have the following hierarchy:
-+# ISA: A32 (.) or T16/T32 (thumb).
-+# Architecture: ARMv6S-M (v6-m), ARMv7-M (v7-m), ARMv7E-M (v7e-m),
-+# ARMv8-M Baseline (v8-m.base) or ARMv8-M Mainline (v8-m.main).
-+# FPU: VFPv3-D16 (fpv3), FPV4-SP-D16 (fpv4-sp), FPV5-SP-D16 (fpv5-sp),
-+# FPV5-D16 (fpv5), or None (.).
-+# Float-abi: Soft (.), softfp (softfp), or hard (hard).
-+
-+# Options to build libraries with
-+
-+MULTILIB_OPTIONS += mthumb
-+MULTILIB_DIRNAMES += thumb
-+
-+MULTILIB_OPTIONS += march=armv6s-m/march=armv7-m/march=armv7e-m/march=armv7/march=armv8-m.base/march=armv8-m.main
-+MULTILIB_DIRNAMES += v6-m v7-m v7e-m v7-ar v8-m.base v8-m.main
-+
-+MULTILIB_OPTIONS += mfpu=vfpv3-d16/mfpu=fpv4-sp-d16/mfpu=fpv5-sp-d16/mfpu=fpv5-d16
-+MULTILIB_DIRNAMES += fpv3 fpv4-sp fpv5-sp fpv5
-+
-+MULTILIB_OPTIONS += mfloat-abi=softfp/mfloat-abi=hard
-+MULTILIB_DIRNAMES += softfp hard
-+
-+
-+# Option combinations to build library with
-+
-+# Default CPU/Arch
-+MULTILIB_REQUIRED += mthumb
-+MULTILIB_REQUIRED += mfloat-abi=hard
-+
-+# ARMv6-M
-+MULTILIB_REQUIRED += mthumb/march=armv6s-m
-+
-+# ARMv8-M Baseline
-+MULTILIB_REQUIRED += mthumb/march=armv8-m.base
-+
-+# ARMv7-M
-+MULTILIB_REQUIRED += mthumb/march=armv7-m
-+
-+# ARMv7E-M
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv4-sp-d16/mfloat-abi=softfp
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv4-sp-d16/mfloat-abi=hard
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-d16/mfloat-abi=softfp
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-d16/mfloat-abi=hard
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-sp-d16/mfloat-abi=softfp
-+MULTILIB_REQUIRED += mthumb/march=armv7e-m/mfpu=fpv5-sp-d16/mfloat-abi=hard
-+
-+# ARMv8-M Mainline
-+MULTILIB_REQUIRED += mthumb/march=armv8-m.main
-+MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-d16/mfloat-abi=softfp
-+MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-d16/mfloat-abi=hard
-+MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-sp-d16/mfloat-abi=softfp
-+MULTILIB_REQUIRED += mthumb/march=armv8-m.main/mfpu=fpv5-sp-d16/mfloat-abi=hard
-+
-+# ARMv7-R as well as ARMv7-A and ARMv8-A if aprofile was not specified
-+MULTILIB_REQUIRED += mthumb/march=armv7
-+MULTILIB_REQUIRED += mthumb/march=armv7/mfpu=vfpv3-d16/mfloat-abi=softfp
-+MULTILIB_REQUIRED += mthumb/march=armv7/mfpu=vfpv3-d16/mfloat-abi=hard
-+
-+
-+# Matches
-+
-+# CPU Matches
-+MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0
-+MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0.small-multiply
-+MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0plus
-+MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m0plus.small-multiply
-+MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m1
-+MULTILIB_MATCHES += march?armv6s-m=mcpu?cortex-m1.small-multiply
-+MULTILIB_MATCHES += march?armv7-m=mcpu?cortex-m3
-+MULTILIB_MATCHES += march?armv7e-m=mcpu?cortex-m4
-+MULTILIB_MATCHES += march?armv7e-m=mcpu?cortex-m7
-+MULTILIB_MATCHES += march?armv8-m.base=mcpu?cortex-m23
-+MULTILIB_MATCHES += march?armv8-m.main=mcpu?cortex-m33
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4f
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-r5
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-r7
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-r8
-+MULTILIB_MATCHES += march?armv7=mcpu?marvell-pj4
-+MULTILIB_MATCHES += march?armv7=mcpu?generic-armv7-a
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a8
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a9
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a5
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a7
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a15
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a12
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a17
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a15.cortex-a7
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a17.cortex-a7
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a32
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a35
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a53
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a57
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a57.cortex-a53
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a72
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a72.cortex-a53
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73.cortex-a35
-+MULTILIB_MATCHES += march?armv7=mcpu?cortex-a73.cortex-a53
-+MULTILIB_MATCHES += march?armv7=mcpu?exynos-m1
-+MULTILIB_MATCHES += march?armv7=mcpu?qdf24xx
-+MULTILIB_MATCHES += march?armv7=mcpu?xgene1
-+
-+# Arch Matches
-+MULTILIB_MATCHES += march?armv6s-m=march?armv6-m
-+MULTILIB_MATCHES += march?armv8-m.main=march?armv8-m.main+dsp
-+MULTILIB_MATCHES += march?armv7=march?armv7-r
-+ifeq (,$(HAS_APROFILE))
-+MULTILIB_MATCHES += march?armv7=march?armv7-a
-+MULTILIB_MATCHES += march?armv7=march?armv7ve
-+MULTILIB_MATCHES += march?armv7=march?armv8-a
-+MULTILIB_MATCHES += march?armv7=march?armv8-a+crc
-+MULTILIB_MATCHES += march?armv7=march?armv8.1-a
-+MULTILIB_MATCHES += march?armv7=march?armv8.1-a+crc
-+MULTILIB_MATCHES += march?armv7=march?armv8.2-a
-+MULTILIB_MATCHES += march?armv7=march?armv8.2-a+fp16
-+endif
-+
-+# FPU matches
-+ifeq (,$(HAS_APROFILE))
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-fp16
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv3-d16-fp16
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon-fp16
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv4
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?vfpv4-d16
-+MULTILIB_MATCHES += mfpu?vfpv3-d16=mfpu?neon-vfpv4
-+MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?fp-armv8
-+MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?neon-fp-armv8
-+MULTILIB_MATCHES += mfpu?fpv5-d16=mfpu?crypto-neon-fp-armv8
-+endif
-+
-+
-+# We map all requests for ARMv7-R or ARMv7-A in ARM mode to Thumb mode and
-+# any FPU to VFPv3-d16 if possible.
-+MULTILIB_REUSE += mthumb/march.armv7=march.armv7
-+MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp
-+MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard
-+MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=march.armv7/mfpu.fpv5-d16/mfloat-abi.softfp
-+MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=march.armv7/mfpu.fpv5-d16/mfloat-abi.hard
-+MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.softfp=mthumb/march.armv7/mfpu.fpv5-d16/mfloat-abi.softfp
-+MULTILIB_REUSE += mthumb/march.armv7/mfpu.vfpv3-d16/mfloat-abi.hard=mthumb/march.armv7/mfpu.fpv5-d16/mfloat-abi.hard
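# Usage note for t-rmprofile: with a toolchain built from this fragment, the
# selected library directory can be checked with GCC's -print-multi-directory
# option; for example (illustrative invocation for an arm-none-eabi build),
# "-mthumb -march=armv7e-m -mfpu=fpv4-sp-d16 -mfloat-abi=hard" maps to
# thumb/v7e-m/fpv4-sp/hard, and the CPU matches make -mcpu=cortex-m4 resolve
# to the same set.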
---- a/src/gcc/config/arm/thumb1.md
-+++ b/src/gcc/config/arm/thumb1.md
-@@ -55,6 +55,10 @@
- (set_attr "type" "multiple")]
- )
-
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic additions in sync.md and to the logic for bind_old_new in
-+;; arm_split_atomic_op in arm.c. These must be at least as strict as the
-+;; constraints here and aim to be as permissive.
- (define_insn_and_split "*thumb1_addsi3"
- [(set (match_operand:SI 0 "register_operand" "=l,l,l,*rk,*hk,l,k,l,l,l")
- (plus:SI (match_operand:SI 1 "register_operand" "%0,0,l,*0,*0,k,k,0,l,k")
-@@ -114,8 +118,8 @@
- (set (match_dup 0)
- (plus:SI (match_dup 0) (reg:SI SP_REGNUM)))]
- "TARGET_THUMB1
-- && (unsigned HOST_WIDE_INT) (INTVAL (operands[1])) < 1024
-- && (INTVAL (operands[1]) & 3) == 0"
-+ && UINTVAL (operands[1]) < 1024
-+ && (UINTVAL (operands[1]) & 3) == 0"
- [(set (match_dup 0) (plus:SI (reg:SI SP_REGNUM) (match_dup 1)))]
- ""
- )
-@@ -131,6 +135,10 @@
- (set_attr "type" "multiple")]
- )
-
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic subtractions in sync.md and to the logic for bind_old_new in
-+;; arm_split_atomic_op in arm.c. These must be at least as strict as the
-+;; constraints here and aim to be as permissive.
- (define_insn "thumb1_subsi3_insn"
- [(set (match_operand:SI 0 "register_operand" "=l")
- (minus:SI (match_operand:SI 1 "register_operand" "l")
-@@ -142,11 +150,11 @@
- (set_attr "type" "alus_sreg")]
- )
-
--; Unfortunately with the Thumb the '&'/'0' trick can fails when operands
--; 1 and 2; are the same, because reload will make operand 0 match
--; operand 1 without realizing that this conflicts with operand 2. We fix
--; this by adding another alternative to match this case, and then `reload'
--; it ourselves. This alternative must come first.
-+;; Unfortunately on Thumb the '&'/'0' trick can fail when operands
-+;; 1 and 2 are the same, because reload will make operand 0 match
-+;; operand 1 without realizing that this conflicts with operand 2. We fix
-+;; this by adding another alternative to match this case, and then `reload'
-+;; it ourselves. This alternative must come first.
- (define_insn "*thumb_mulsi3"
- [(set (match_operand:SI 0 "register_operand" "=&l,&l,&l")
- (mult:SI (match_operand:SI 1 "register_operand" "%l,*h,0")
-@@ -173,6 +181,10 @@
- (set_attr "type" "muls")]
- )
-
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic bitwise ANDs and NANDs in sync.md and to the logic for bind_old_new
-+;; in arm_split_atomic_op in arm.c. These must be at least as strict as the
-+;; constraints here and aim to be as permissive.
- (define_insn "*thumb1_andsi3_insn"
- [(set (match_operand:SI 0 "register_operand" "=l")
- (and:SI (match_operand:SI 1 "register_operand" "%0")
-@@ -227,6 +239,10 @@
- (set_attr "type" "logics_reg")]
- )
-
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic inclusive ORs in sync.md and to the logic for bind_old_new in
-+;; arm_split_atomic_op in arm.c. These must be at least as strict as the
-+;; constraints here and aim to be as permissive.
- (define_insn "*thumb1_iorsi3_insn"
- [(set (match_operand:SI 0 "register_operand" "=l")
- (ior:SI (match_operand:SI 1 "register_operand" "%0")
-@@ -237,6 +253,10 @@
- (set_attr "conds" "set")
- (set_attr "type" "logics_reg")])
-
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic exclusive ORs in sync.md and to the logic for bind_old_new in
-+;; arm_split_atomic_op in arm.c. These must be at least as strict as the
-+;; constraints here and aim to be as permissive.
- (define_insn "*thumb1_xorsi3_insn"
- [(set (match_operand:SI 0 "register_operand" "=l")
- (xor:SI (match_operand:SI 1 "register_operand" "%0")
-@@ -590,8 +610,8 @@
- ;;; ??? The 'i' constraint looks funny, but it should always be replaced by
- ;;; thumb_reorg with a memory reference.
- (define_insn "*thumb1_movdi_insn"
-- [(set (match_operand:DI 0 "nonimmediate_operand" "=l,l,l,l,>,l, m,*r")
-- (match_operand:DI 1 "general_operand" "l, I,J,>,l,mi,l,*r"))]
-+ [(set (match_operand:DI 0 "nonimmediate_operand" "=l,l,l,r,l,>,l, m,*r")
-+ (match_operand:DI 1 "general_operand" "l, I,J,j,>,l,mi,l,*r"))]
- "TARGET_THUMB1
- && ( register_operand (operands[0], DImode)
- || register_operand (operands[1], DImode))"
-@@ -610,36 +630,41 @@
- operands[1] = GEN_INT (- INTVAL (operands[1]));
- return \"movs\\t%Q0, %1\;rsbs\\t%Q0, %Q0, #0\;asrs\\t%R0, %Q0, #31\";
- case 3:
-- return \"ldmia\\t%1, {%0, %H0}\";
-+ gcc_assert (TARGET_HAVE_MOVT);
-+ return \"movw\\t%Q0, %L1\;movs\\t%R0, #0\";
- case 4:
-- return \"stmia\\t%0, {%1, %H1}\";
-+ return \"ldmia\\t%1, {%0, %H0}\";
- case 5:
-- return thumb_load_double_from_address (operands);
-+ return \"stmia\\t%0, {%1, %H1}\";
- case 6:
-+ return thumb_load_double_from_address (operands);
-+ case 7:
- operands[2] = gen_rtx_MEM (SImode,
- plus_constant (Pmode, XEXP (operands[0], 0), 4));
- output_asm_insn (\"str\\t%1, %0\;str\\t%H1, %2\", operands);
- return \"\";
-- case 7:
-+ case 8:
- if (REGNO (operands[1]) == REGNO (operands[0]) + 1)
- return \"mov\\t%0, %1\;mov\\t%H0, %H1\";
- return \"mov\\t%H0, %H1\;mov\\t%0, %1\";
- }
- }"
-- [(set_attr "length" "4,4,6,2,2,6,4,4")
-- (set_attr "type" "multiple,multiple,multiple,load2,store2,load2,store2,multiple")
-- (set_attr "pool_range" "*,*,*,*,*,1018,*,*")]
-+ [(set_attr "length" "4,4,6,6,2,2,6,4,4")
-+ (set_attr "type" "multiple,multiple,multiple,multiple,load2,store2,load2,store2,multiple")
-+ (set_attr "arch" "t1,t1,t1,v8mb,t1,t1,t1,t1,t1")
-+ (set_attr "pool_range" "*,*,*,*,*,*,1018,*,*")]
- )
-
- (define_insn "*thumb1_movsi_insn"
-- [(set (match_operand:SI 0 "nonimmediate_operand" "=l,l,l,l,l,>,l, m,*l*h*k")
-- (match_operand:SI 1 "general_operand" "l, I,J,K,>,l,mi,l,*l*h*k"))]
-+ [(set (match_operand:SI 0 "nonimmediate_operand" "=l,l,r,l,l,l,>,l, m,*l*h*k")
-+ (match_operand:SI 1 "general_operand" "l, I,j,J,K,>,l,mi,l,*l*h*k"))]
- "TARGET_THUMB1
- && ( register_operand (operands[0], SImode)
- || register_operand (operands[1], SImode))"
- "@
- movs %0, %1
- movs %0, %1
-+ movw %0, %1
- #
- #
- ldmia\\t%1, {%0}
-@@ -647,10 +672,11 @@
- ldr\\t%0, %1
- str\\t%1, %0
- mov\\t%0, %1"
-- [(set_attr "length" "2,2,4,4,2,2,2,2,2")
-- (set_attr "type" "mov_reg,mov_imm,multiple,multiple,load1,store1,load1,store1,mov_reg")
-- (set_attr "pool_range" "*,*,*,*,*,*,1018,*,*")
-- (set_attr "conds" "set,clob,*,*,nocond,nocond,nocond,nocond,nocond")])
-+ [(set_attr "length" "2,2,4,4,4,2,2,2,2,2")
-+ (set_attr "type" "mov_reg,mov_imm,mov_imm,multiple,multiple,load1,store1,load1,store1,mov_reg")
-+ (set_attr "pool_range" "*,*,*,*,*,*,*,1018,*,*")
-+ (set_attr "arch" "t1,t1,v8mb,t1,t1,t1,t1,t1,t1,t1")
-+ (set_attr "conds" "set,clob,nocond,*,*,nocond,nocond,nocond,nocond,nocond")])
-
- ; Split the load of 64-bit constant into two loads for high and low 32-bit parts respectively
- ; to see if we can load them in fewer instructions or fewer cycles.
-@@ -687,7 +713,8 @@
- (define_split
- [(set (match_operand:SI 0 "register_operand" "")
- (match_operand:SI 1 "const_int_operand" ""))]
-- "TARGET_THUMB1 && satisfies_constraint_K (operands[1])"
-+ "TARGET_THUMB1 && satisfies_constraint_K (operands[1])
-+ && !(TARGET_HAVE_MOVT && satisfies_constraint_j (operands[1]))"
- [(set (match_dup 2) (match_dup 1))
- (set (match_dup 0) (ashift:SI (match_dup 2) (match_dup 3)))]
- "
-@@ -714,7 +741,8 @@
- (define_split
- [(set (match_operand:SI 0 "register_operand" "")
- (match_operand:SI 1 "const_int_operand" ""))]
-- "TARGET_THUMB1 && satisfies_constraint_Pe (operands[1])"
-+ "TARGET_THUMB1 && satisfies_constraint_Pe (operands[1])
-+ && !(TARGET_HAVE_MOVT && satisfies_constraint_j (operands[1]))"
- [(set (match_dup 2) (match_dup 1))
- (set (match_dup 0) (plus:SI (match_dup 2) (match_dup 3)))]
- "
-@@ -726,8 +754,8 @@
- )
-
- (define_insn "*thumb1_movhi_insn"
-- [(set (match_operand:HI 0 "nonimmediate_operand" "=l,l,m,l*r,*h,l")
-- (match_operand:HI 1 "general_operand" "l,m,l,k*h,*r,I"))]
-+ [(set (match_operand:HI 0 "nonimmediate_operand" "=l,l,m,l*r,*h,l,r")
-+ (match_operand:HI 1 "general_operand" "l,m,l,k*h,*r,I,n"))]
- "TARGET_THUMB1
- && ( register_operand (operands[0], HImode)
- || register_operand (operands[1], HImode))"
-@@ -739,6 +767,8 @@
- case 3: return \"mov %0, %1\";
- case 4: return \"mov %0, %1\";
- case 5: return \"movs %0, %1\";
-+ case 6: gcc_assert (TARGET_HAVE_MOVT);
-+ return \"movw %0, %L1\";
- default: gcc_unreachable ();
- case 1:
- /* The stack pointer can end up being taken as an index register.
-@@ -758,9 +788,10 @@
- }
- return \"ldrh %0, %1\";
- }"
-- [(set_attr "length" "2,4,2,2,2,2")
-- (set_attr "type" "alus_imm,load1,store1,mov_reg,mov_reg,mov_imm")
-- (set_attr "conds" "clob,nocond,nocond,nocond,nocond,clob")])
-+ [(set_attr "length" "2,4,2,2,2,2,4")
-+ (set_attr "type" "alus_imm,load1,store1,mov_reg,mov_reg,mov_imm,mov_imm")
-+ (set_attr "arch" "t1,t1,t1,t1,t1,t1,v8mb")
-+ (set_attr "conds" "clob,nocond,nocond,nocond,nocond,clob,nocond")])
-
- (define_expand "thumb_movhi_clobber"
- [(set (match_operand:HI 0 "memory_operand" "")
-@@ -963,6 +994,94 @@
- DONE;
- })
-
-+;; A pattern for the CB(N)Z instruction added in ARMv8-M Baseline profile,
-+;; adapted from cbranchsi4_insn. Modifying cbranchsi4_insn instead leads to
-+;; code generation difference for ARMv6-M because the minimum length of the
-+;; instruction becomes 2 even for ARMv6-M due to a limitation in genattrtab's
-+;; handling of PC in the length condition.
-+(define_insn "thumb1_cbz"
-+ [(set (pc) (if_then_else
-+ (match_operator 0 "equality_operator"
-+ [(match_operand:SI 1 "s_register_operand" "l")
-+ (const_int 0)])
-+ (label_ref (match_operand 2 "" ""))
-+ (pc)))]
-+ "TARGET_THUMB1 && TARGET_HAVE_CBZ"
-+{
-+ if (get_attr_length (insn) == 2)
-+ {
-+ if (GET_CODE (operands[0]) == EQ)
-+ return "cbz\t%1, %l2";
-+ else
-+ return "cbnz\t%1, %l2";
-+ }
-+ else
-+ {
-+ rtx t = cfun->machine->thumb1_cc_insn;
-+ if (t != NULL_RTX)
-+ {
-+ if (!rtx_equal_p (cfun->machine->thumb1_cc_op0, operands[1])
-+ || !rtx_equal_p (cfun->machine->thumb1_cc_op1, operands[2]))
-+ t = NULL_RTX;
-+ if (cfun->machine->thumb1_cc_mode == CC_NOOVmode)
-+ {
-+ if (!noov_comparison_operator (operands[0], VOIDmode))
-+ t = NULL_RTX;
-+ }
-+ else if (cfun->machine->thumb1_cc_mode != CCmode)
-+ t = NULL_RTX;
-+ }
-+ if (t == NULL_RTX)
-+ {
-+ output_asm_insn ("cmp\t%1, #0", operands);
-+ cfun->machine->thumb1_cc_insn = insn;
-+ cfun->machine->thumb1_cc_op0 = operands[1];
-+ cfun->machine->thumb1_cc_op1 = operands[2];
-+ cfun->machine->thumb1_cc_mode = CCmode;
-+ }
-+ else
-+ /* Ensure we emit the right type of condition code on the jump. */
-+ XEXP (operands[0], 0) = gen_rtx_REG (cfun->machine->thumb1_cc_mode,
-+ CC_REGNUM);
-+
-+ switch (get_attr_length (insn))
-+ {
-+ case 4: return "b%d0\t%l2";
-+ case 6: return "b%D0\t.LCB%=;b\t%l2\t%@long jump\n.LCB%=:";
-+ case 8: return "b%D0\t.LCB%=;bl\t%l2\t%@far jump\n.LCB%=:";
-+ default: gcc_unreachable ();
-+ }
-+ }
-+}
-+ [(set (attr "far_jump")
-+ (if_then_else
-+ (eq_attr "length" "8")
-+ (const_string "yes")
-+ (const_string "no")))
-+ (set (attr "length")
-+ (if_then_else
-+ (and (ge (minus (match_dup 2) (pc)) (const_int 2))
-+ (le (minus (match_dup 2) (pc)) (const_int 128)))
-+ (const_int 2)
-+ (if_then_else
-+ (and (ge (minus (match_dup 2) (pc)) (const_int -250))
-+ (le (minus (match_dup 2) (pc)) (const_int 256)))
-+ (const_int 4)
-+ (if_then_else
-+ (and (ge (minus (match_dup 2) (pc)) (const_int -2040))
-+ (le (minus (match_dup 2) (pc)) (const_int 2048)))
-+ (const_int 6)
-+ (const_int 8)))))
-+ (set (attr "type")
-+ (if_then_else
-+ (eq_attr "length" "2")
-+ (const_string "branch")
-+ (const_string "multiple")))]
-+)
-+
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic compare_and_swap splitters in sync.md. These must be at least as
-+;; strict as the constraints here and aim to be as permissive.
- (define_insn "cbranchsi4_insn"
- [(set (pc) (if_then_else
- (match_operator 0 "arm_comparison_operator"
-@@ -1024,6 +1143,9 @@
- (set_attr "type" "multiple")]
- )
-
-+;; Changes to the constraints of this pattern must be propagated to those of
-+;; atomic compare_and_swap splitters in sync.md. These must be at least as
-+;; strict as the constraints here and aim to be as permissive.
- (define_insn "cbranchsi4_scratch"
- [(set (pc) (if_then_else
- (match_operator 4 "arm_comparison_operator"
-@@ -1609,6 +1731,19 @@
- (set_attr "type" "call")]
- )
-
-+(define_insn "*nonsecure_call_reg_thumb1_v5"
-+ [(call (unspec:SI [(mem:SI (match_operand:SI 0 "register_operand" "l*r"))]
-+ UNSPEC_NONSECURE_MEM)
-+ (match_operand 1 "" ""))
-+ (use (match_operand 2 "" ""))
-+ (clobber (reg:SI LR_REGNUM))
-+ (clobber (match_dup 0))]
-+ "TARGET_THUMB1 && use_cmse && !SIBLING_CALL_P (insn)"
-+ "bl\\t__gnu_cmse_nonsecure_call"
-+ [(set_attr "length" "4")
-+ (set_attr "type" "call")]
-+)
-+
- (define_insn "*call_reg_thumb1"
- [(call (mem:SI (match_operand:SI 0 "register_operand" "l*r"))
- (match_operand 1 "" ""))
-@@ -1641,6 +1776,21 @@
- (set_attr "type" "call")]
- )
-
-+(define_insn "*nonsecure_call_value_reg_thumb1_v5"
-+ [(set (match_operand 0 "" "")
-+ (call (unspec:SI
-+ [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))]
-+ UNSPEC_NONSECURE_MEM)
-+ (match_operand 2 "" "")))
-+ (use (match_operand 3 "" ""))
-+ (clobber (reg:SI LR_REGNUM))
-+ (clobber (match_dup 1))]
-+ "TARGET_THUMB1 && use_cmse"
-+ "bl\\t__gnu_cmse_nonsecure_call"
-+ [(set_attr "length" "4")
-+ (set_attr "type" "call")]
-+)
-+
- (define_insn "*call_value_reg_thumb1"
- [(set (match_operand 0 "" "")
- (call (mem:SI (match_operand:SI 1 "register_operand" "l*r"))
-@@ -1747,8 +1897,13 @@
- "*
- return thumb1_unexpanded_epilogue ();
- "
-- ; Length is absolute worst case
-- [(set_attr "length" "44")
-+ ; Length is the absolute worst case; when using CMSE, an entry
-+ ; function will have an extra 4 bytes (one MSR) added.
-+ [(set (attr "length")
-+ (if_then_else
-+ (match_test "IS_CMSE_ENTRY (arm_current_func_type ())")
-+ (const_int 48)
-+ (const_int 44)))
- (set_attr "type" "block")
- ;; We don't clobber the conditions, but the potential length of this
- ;; operation is sufficient to make conditionalizing the sequence
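# A hedged C-level illustration of the thumb1.md changes above (the function
# is illustrative, not from the patch): on ARMv8-M Baseline, where
# TARGET_HAVE_CBZ and TARGET_HAVE_MOVT hold, both new capabilities show up
# in code as simple as:
#
#   unsigned int
#   f (unsigned int *p)
#   {
#     if (p == 0)        /* short forward branch: single cbz/cbnz  */
#       return 0xfff0u;  /* 16-bit immediate: single movw          */
#     return *p;
#   }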
---- a/src/gcc/config/arm/thumb2.md
-+++ b/src/gcc/config/arm/thumb2.md
-@@ -125,32 +125,6 @@
- (set_attr "type" "multiple")]
- )
-
--;; Thumb-2 does not have rsc, so use a clever trick with shifter operands.
--(define_insn_and_split "*thumb2_negdi2"
-- [(set (match_operand:DI 0 "s_register_operand" "=&r,r")
-- (neg:DI (match_operand:DI 1 "s_register_operand" "?r,0")))
-- (clobber (reg:CC CC_REGNUM))]
-- "TARGET_THUMB2"
-- "#" ; negs\\t%Q0, %Q1\;sbc\\t%R0, %R1, %R1, lsl #1
-- "&& reload_completed"
-- [(parallel [(set (reg:CC CC_REGNUM)
-- (compare:CC (const_int 0) (match_dup 1)))
-- (set (match_dup 0) (minus:SI (const_int 0) (match_dup 1)))])
-- (set (match_dup 2) (minus:SI (minus:SI (match_dup 3)
-- (ashift:SI (match_dup 3)
-- (const_int 1)))
-- (ltu:SI (reg:CC_C CC_REGNUM) (const_int 0))))]
-- {
-- operands[2] = gen_highpart (SImode, operands[0]);
-- operands[0] = gen_lowpart (SImode, operands[0]);
-- operands[3] = gen_highpart (SImode, operands[1]);
-- operands[1] = gen_lowpart (SImode, operands[1]);
-- }
-- [(set_attr "conds" "clob")
-- (set_attr "length" "8")
-- (set_attr "type" "multiple")]
--)
--
- (define_insn_and_split "*thumb2_abssi2"
- [(set (match_operand:SI 0 "s_register_operand" "=&r,l,r")
- (abs:SI (match_operand:SI 1 "s_register_operand" "r,0,0")))
-@@ -278,8 +252,7 @@
- (define_insn "*thumb2_movsi_insn"
- [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l ,*hk,m,*m")
- (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))]
-- "TARGET_THUMB2 && ! TARGET_IWMMXT
-- && !(TARGET_HARD_FLOAT && TARGET_VFP)
-+ "TARGET_THUMB2 && !TARGET_IWMMXT && !TARGET_HARD_FLOAT
- && ( register_operand (operands[0], SImode)
- || register_operand (operands[1], SImode))"
- "@
-@@ -581,6 +554,19 @@
- [(set_attr "type" "call")]
- )
-
-+(define_insn "*nonsecure_call_reg_thumb2"
-+ [(call (unspec:SI [(mem:SI (match_operand:SI 0 "s_register_operand" "r"))]
-+ UNSPEC_NONSECURE_MEM)
-+ (match_operand 1 "" ""))
-+ (use (match_operand 2 "" ""))
-+ (clobber (reg:SI LR_REGNUM))
-+ (clobber (match_dup 0))]
-+ "TARGET_THUMB2 && use_cmse"
-+ "bl\\t__gnu_cmse_nonsecure_call"
-+ [(set_attr "length" "4")
-+ (set_attr "type" "call")]
-+)
-+
- (define_insn "*call_value_reg_thumb2"
- [(set (match_operand 0 "" "")
- (call (mem:SI (match_operand:SI 1 "register_operand" "l*r"))
-@@ -592,6 +578,21 @@
- [(set_attr "type" "call")]
- )
-
-+(define_insn "*nonsecure_call_value_reg_thumb2"
-+ [(set (match_operand 0 "" "")
-+ (call
-+ (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))]
-+ UNSPEC_NONSECURE_MEM)
-+ (match_operand 2 "" "")))
-+ (use (match_operand 3 "" ""))
-+ (clobber (reg:SI LR_REGNUM))
-+ (clobber (match_dup 1))]
-+ "TARGET_THUMB2 && use_cmse"
-+ "bl\t__gnu_cmse_nonsecure_call"
-+ [(set_attr "length" "4")
-+ (set_attr "type" "call")]
-+)
-+
- (define_insn "*thumb2_indirect_jump"
- [(set (pc)
- (match_operand:SI 0 "register_operand" "l*r"))]
-@@ -1115,12 +1116,31 @@
-
- (define_insn "*thumb2_return"
- [(simple_return)]
-- "TARGET_THUMB2"
-+ "TARGET_THUMB2 && !IS_CMSE_ENTRY (arm_current_func_type ())"
- "* return output_return_instruction (const_true_rtx, true, false, true);"
- [(set_attr "type" "branch")
- (set_attr "length" "4")]
- )
-
-+(define_insn "*thumb2_cmse_entry_return"
-+ [(simple_return)]
-+ "TARGET_THUMB2 && IS_CMSE_ENTRY (arm_current_func_type ())"
-+ "* return output_return_instruction (const_true_rtx, true, false, true);"
-+ [(set_attr "type" "branch")
-+ ; This is a return from a cmse_nonsecure_entry function, so code will be
-+ ; added to clear the APSR and, if VFP is available, the FPSCR; the
-+ ; length is adapted accordingly.
-+ (set (attr "length")
-+ (if_then_else (match_test "TARGET_HARD_FLOAT")
-+ (const_int 12)
-+ (const_int 8)))
-+ ; We do not support predicated execution of returns from cmse_nonsecure_entry
-+ ; functions because we need to clear the APSR. Since predicable has to be
-+ ; a constant, we had to duplicate the thumb2_return pattern for CMSE entry
-+ ; functions.
-+ (set_attr "predicable" "no")]
-+)
-+
- (define_insn_and_split "thumb2_eh_return"
- [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")]
- VUNSPEC_EH_RETURN)
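# The nonsecure_call patterns above pair with GCC's CMSE source attributes;
# a minimal sketch (compile with -mcmse on an ARMv8-M target; all names
# below are illustrative):
#
#   typedef int __attribute__ ((cmse_nonsecure_call)) ns_fn (int);
#
#   int __attribute__ ((cmse_nonsecure_entry))
#   entry (ns_fn *callback, int x)
#   {
#     /* The indirect call is routed through the __gnu_cmse_nonsecure_call
#        helper, matching *nonsecure_call_value_reg_thumb2 above.  */
#     return callback (x);
#   }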
---- a/src/gcc/config/arm/types.md
-+++ b/src/gcc/config/arm/types.md
-@@ -51,6 +51,7 @@
- ; alus_shift_imm as alu_shift_imm, setting condition flags.
- ; alus_shift_reg as alu_shift_reg, setting condition flags.
- ; bfm bitfield move operation.
-+; bfx bitfield extract operation.
- ; block blockage insn, this blocks all functional units.
- ; branch branch.
- ; call subroutine call.
-@@ -557,6 +558,7 @@
- alus_shift_imm,\
- alus_shift_reg,\
- bfm,\
-+ bfx,\
- block,\
- branch,\
- call,\
---- a/src/gcc/config/arm/unspecs.md
-+++ b/src/gcc/config/arm/unspecs.md
-@@ -84,6 +84,8 @@
- UNSPEC_VRINTA ; Represent a float to integral float rounding
- ; towards nearest, ties away from zero.
- UNSPEC_PROBE_STACK ; Probe stack memory reference
-+ UNSPEC_NONSECURE_MEM ; Represent non-secure memory in ARMv8-M with
-+ ; security extension
- ])
-
- (define_c_enum "unspec" [
-@@ -191,6 +193,8 @@
- UNSPEC_VBSL
- UNSPEC_VCAGE
- UNSPEC_VCAGT
-+ UNSPEC_VCALE
-+ UNSPEC_VCALT
- UNSPEC_VCEQ
- UNSPEC_VCGE
- UNSPEC_VCGEU
-@@ -203,6 +207,20 @@
- UNSPEC_VCVT_U
- UNSPEC_VCVT_S_N
- UNSPEC_VCVT_U_N
-+ UNSPEC_VCVT_HF_S_N
-+ UNSPEC_VCVT_HF_U_N
-+ UNSPEC_VCVT_SI_S_N
-+ UNSPEC_VCVT_SI_U_N
-+ UNSPEC_VCVTH_S
-+ UNSPEC_VCVTH_U
-+ UNSPEC_VCVTA_S
-+ UNSPEC_VCVTA_U
-+ UNSPEC_VCVTM_S
-+ UNSPEC_VCVTM_U
-+ UNSPEC_VCVTN_S
-+ UNSPEC_VCVTN_U
-+ UNSPEC_VCVTP_S
-+ UNSPEC_VCVTP_U
- UNSPEC_VEXT
- UNSPEC_VHADD_S
- UNSPEC_VHADD_U
-@@ -244,6 +262,8 @@
- UNSPEC_VMLSL_S_LANE
- UNSPEC_VMLSL_U_LANE
- UNSPEC_VMLSL_LANE
-+ UNSPEC_VFMA_LANE
-+ UNSPEC_VFMS_LANE
- UNSPEC_VMOVL_S
- UNSPEC_VMOVL_U
- UNSPEC_VMOVN
-@@ -365,5 +385,11 @@
- UNSPEC_NVRINTN
- UNSPEC_VQRDMLAH
- UNSPEC_VQRDMLSH
-+ UNSPEC_VRND
-+ UNSPEC_VRNDA
-+ UNSPEC_VRNDI
-+ UNSPEC_VRNDM
-+ UNSPEC_VRNDN
-+ UNSPEC_VRNDP
-+ UNSPEC_VRNDX
- ])
--
---- a/src/gcc/config/arm/vec-common.md
-+++ b/src/gcc/config/arm/vec-common.md
-@@ -124,6 +124,20 @@
- FAIL;
- })
-
-+(define_expand "vec_perm_const<mode>"
-+ [(match_operand:VH 0 "s_register_operand")
-+ (match_operand:VH 1 "s_register_operand")
-+ (match_operand:VH 2 "s_register_operand")
-+ (match_operand:<V_cmp_result> 3)]
-+ "TARGET_NEON"
-+{
-+ if (arm_expand_vec_perm_const (operands[0], operands[1],
-+ operands[2], operands[3]))
-+ DONE;
-+ else
-+ FAIL;
-+})
-+
- (define_expand "vec_perm<mode>"
- [(match_operand:VE 0 "s_register_operand" "")
- (match_operand:VE 1 "s_register_operand" "")
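# The new vec_perm_const<mode> expander extends constant permutations to the
# half-float vector modes in VH; at the C level such permutations are written
# with __builtin_shuffle, sketched here on an integer vector for brevity:
#
#   typedef short v8hi __attribute__ ((vector_size (16)));
#
#   v8hi
#   rev (v8hi x)
#   {
#     /* A constant selector reaches vec_perm_const rather than vec_perm.  */
#     const v8hi sel = { 7, 6, 5, 4, 3, 2, 1, 0 };
#     return __builtin_shuffle (x, sel);
#   }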
---- a/src/gcc/config/arm/vfp.md
-+++ b/src/gcc/config/arm/vfp.md
-@@ -18,13 +18,206 @@
- ;; along with GCC; see the file COPYING3. If not see
- ;; <http://www.gnu.org/licenses/>. */
-
-+;; Patterns for HI moves which provide more data transfer instructions when VFP
-+;; support is enabled.
-+(define_insn "*arm_movhi_vfp"
-+ [(set
-+ (match_operand:HI 0 "nonimmediate_operand"
-+ "=rk, r, r, m, r, *t, r, *t")
-+ (match_operand:HI 1 "general_operand"
-+ "rIk, K, n, r, mi, r, *t, *t"))]
-+ "TARGET_ARM && TARGET_HARD_FLOAT
-+ && !TARGET_VFP_FP16INST
-+ && (register_operand (operands[0], HImode)
-+ || register_operand (operands[1], HImode))"
-+{
-+ switch (which_alternative)
-+ {
-+ case 0:
-+ return "mov%?\t%0, %1\t%@ movhi";
-+ case 1:
-+ return "mvn%?\t%0, #%B1\t%@ movhi";
-+ case 2:
-+ return "movw%?\t%0, %L1\t%@ movhi";
-+ case 3:
-+ return "strh%?\t%1, %0\t%@ movhi";
-+ case 4:
-+ return "ldrh%?\t%0, %1\t%@ movhi";
-+ case 5:
-+ case 6:
-+ return "vmov%?\t%0, %1\t%@ int";
-+ case 7:
-+ return "vmov%?.f32\t%0, %1\t%@ int";
-+ default:
-+ gcc_unreachable ();
-+ }
-+}
-+ [(set_attr "predicable" "yes")
-+ (set_attr_alternative "type"
-+ [(if_then_else
-+ (match_operand 1 "const_int_operand" "")
-+ (const_string "mov_imm")
-+ (const_string "mov_reg"))
-+ (const_string "mvn_imm")
-+ (const_string "mov_imm")
-+ (const_string "store1")
-+ (const_string "load1")
-+ (const_string "f_mcr")
-+ (const_string "f_mrc")
-+ (const_string "fmov")])
-+ (set_attr "arch" "*, *, v6t2, *, *, *, *, *")
-+ (set_attr "pool_range" "*, *, *, *, 256, *, *, *")
-+ (set_attr "neg_pool_range" "*, *, *, *, 244, *, *, *")
-+ (set_attr "length" "4")]
-+)
-+
-+(define_insn "*thumb2_movhi_vfp"
-+ [(set
-+ (match_operand:HI 0 "nonimmediate_operand"
-+ "=rk, r, l, r, m, r, *t, r, *t")
-+ (match_operand:HI 1 "general_operand"
-+ "rk, I, Py, n, r, m, r, *t, *t"))]
-+ "TARGET_THUMB2 && TARGET_HARD_FLOAT
-+ && !TARGET_VFP_FP16INST
-+ && (register_operand (operands[0], HImode)
-+ || register_operand (operands[1], HImode))"
-+{
-+ switch (which_alternative)
-+ {
-+ case 0:
-+ case 1:
-+ case 2:
-+ return "mov%?\t%0, %1\t%@ movhi";
-+ case 3:
-+ return "movw%?\t%0, %L1\t%@ movhi";
-+ case 4:
-+ return "strh%?\t%1, %0\t%@ movhi";
-+ case 5:
-+ return "ldrh%?\t%0, %1\t%@ movhi";
-+ case 6:
-+ case 7:
-+ return "vmov%?\t%0, %1\t%@ int";
-+ case 8:
-+ return "vmov%?.f32\t%0, %1\t%@ int";
-+ default:
-+ gcc_unreachable ();
-+ }
-+}
-+ [(set_attr "predicable" "yes")
-+ (set_attr "predicable_short_it"
-+ "yes, no, yes, no, no, no, no, no, no")
-+ (set_attr "type"
-+ "mov_reg, mov_imm, mov_imm, mov_imm, store1, load1,\
-+ f_mcr, f_mrc, fmov")
-+ (set_attr "arch" "*, *, *, v6t2, *, *, *, *, *")
-+ (set_attr "pool_range" "*, *, *, *, *, 4094, *, *, *")
-+ (set_attr "neg_pool_range" "*, *, *, *, *, 250, *, *, *")
-+ (set_attr "length" "2, 4, 2, 4, 4, 4, 4, 4, 4")]
-+)
-+
-+;; Patterns for HI moves which provide more data transfer instructions when FP16
-+;; instructions are available.
-+(define_insn "*arm_movhi_fp16"
-+ [(set
-+ (match_operand:HI 0 "nonimmediate_operand"
-+ "=r, r, r, m, r, *t, r, *t")
-+ (match_operand:HI 1 "general_operand"
-+ "rIk, K, n, r, mi, r, *t, *t"))]
-+ "TARGET_ARM && TARGET_VFP_FP16INST
-+ && (register_operand (operands[0], HImode)
-+ || register_operand (operands[1], HImode))"
-+{
-+ switch (which_alternative)
-+ {
-+ case 0:
-+ return "mov%?\t%0, %1\t%@ movhi";
-+ case 1:
-+ return "mvn%?\t%0, #%B1\t%@ movhi";
-+ case 2:
-+ return "movw%?\t%0, %L1\t%@ movhi";
-+ case 3:
-+ return "strh%?\t%1, %0\t%@ movhi";
-+ case 4:
-+ return "ldrh%?\t%0, %1\t%@ movhi";
-+ case 5:
-+ case 6:
-+ return "vmov.f16\t%0, %1\t%@ int";
-+ case 7:
-+ return "vmov%?.f32\t%0, %1\t%@ int";
-+ default:
-+ gcc_unreachable ();
-+ }
-+}
-+ [(set_attr "predicable" "yes, yes, yes, yes, yes, no, no, yes")
-+ (set_attr_alternative "type"
-+ [(if_then_else
-+ (match_operand 1 "const_int_operand" "")
-+ (const_string "mov_imm")
-+ (const_string "mov_reg"))
-+ (const_string "mvn_imm")
-+ (const_string "mov_imm")
-+ (const_string "store1")
-+ (const_string "load1")
-+ (const_string "f_mcr")
-+ (const_string "f_mrc")
-+ (const_string "fmov")])
-+ (set_attr "arch" "*, *, v6t2, *, *, *, *, *")
-+ (set_attr "pool_range" "*, *, *, *, 256, *, *, *")
-+ (set_attr "neg_pool_range" "*, *, *, *, 244, *, *, *")
-+ (set_attr "length" "4")]
-+)
-+
-+(define_insn "*thumb2_movhi_fp16"
-+ [(set
-+ (match_operand:HI 0 "nonimmediate_operand"
-+ "=rk, r, l, r, m, r, *t, r, *t")
-+ (match_operand:HI 1 "general_operand"
-+ "rk, I, Py, n, r, m, r, *t, *t"))]
-+ "TARGET_THUMB2 && TARGET_VFP_FP16INST
-+ && (register_operand (operands[0], HImode)
-+ || register_operand (operands[1], HImode))"
-+{
-+ switch (which_alternative)
-+ {
-+ case 0:
-+ case 1:
-+ case 2:
-+ return "mov%?\t%0, %1\t%@ movhi";
-+ case 3:
-+ return "movw%?\t%0, %L1\t%@ movhi";
-+ case 4:
-+ return "strh%?\t%1, %0\t%@ movhi";
-+ case 5:
-+ return "ldrh%?\t%0, %1\t%@ movhi";
-+ case 6:
-+ case 7:
-+ return "vmov.f16\t%0, %1\t%@ int";
-+ case 8:
-+ return "vmov%?.f32\t%0, %1\t%@ int";
-+ default:
-+ gcc_unreachable ();
-+ }
-+}
-+ [(set_attr "predicable"
-+ "yes, yes, yes, yes, yes, yes, no, no, yes")
-+ (set_attr "predicable_short_it"
-+ "yes, no, yes, no, no, no, no, no, no")
-+ (set_attr "type"
-+ "mov_reg, mov_imm, mov_imm, mov_imm, store1, load1,\
-+ f_mcr, f_mrc, fmov")
-+ (set_attr "arch" "*, *, *, v6t2, *, *, *, *, *")
-+ (set_attr "pool_range" "*, *, *, *, *, 4094, *, *, *")
-+ (set_attr "neg_pool_range" "*, *, *, *, *, 250, *, *, *")
-+ (set_attr "length" "2, 4, 2, 4, 4, 4, 4, 4, 4")]
-+)
-+
- ;; SImode moves
- ;; ??? For now do not allow loading constants into vfp regs. This causes
- ;; problems because small constants get converted into adds.
- (define_insn "*arm_movsi_vfp"
- [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m ,*t,r,*t,*t, *Uv")
- (match_operand:SI 1 "general_operand" "rk, I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))]
-- "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT
-+ "TARGET_ARM && TARGET_HARD_FLOAT
- && ( s_register_operand (operands[0], SImode)
- || s_register_operand (operands[1], SImode))"
- "*
-@@ -53,7 +246,8 @@
- }
- "
- [(set_attr "predicable" "yes")
-- (set_attr "type" "mov_reg,mov_reg,mvn_imm,mov_imm,load1,store1,f_mcr,f_mrc,fmov,f_loads,f_stores")
-+ (set_attr "type" "mov_reg,mov_reg,mvn_imm,mov_imm,load1,store1,
-+ f_mcr,f_mrc,fmov,f_loads,f_stores")
- (set_attr "pool_range" "*,*,*,*,4096,*,*,*,*,1020,*")
- (set_attr "neg_pool_range" "*,*,*,*,4084,*,*,*,*,1008,*")]
- )
-@@ -66,7 +260,7 @@
- (define_insn "*thumb2_movsi_vfp"
- [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r, l,*hk,m, *m,*t, r,*t,*t, *Uv")
- (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk, r,*t,*t,*Uvi,*t"))]
-- "TARGET_THUMB2 && TARGET_VFP && TARGET_HARD_FLOAT
-+ "TARGET_THUMB2 && TARGET_HARD_FLOAT
- && ( s_register_operand (operands[0], SImode)
- || s_register_operand (operands[1], SImode))"
- "*
-@@ -112,7 +306,7 @@
- (define_insn "*movdi_vfp"
- [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,q,q,m,w,r,w,w, Uv")
- (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,q,r,w,w,Uvi,w"))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune != cortexa8
-+ "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune != cortexa8
- && ( register_operand (operands[0], DImode)
- || register_operand (operands[1], DImode))
- && !(TARGET_NEON && CONST_INT_P (operands[1])
-@@ -163,7 +357,7 @@
- (define_insn "*movdi_vfp_cortexa8"
- [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,r,r,r,r,m,w,!r,w,w, Uv")
- (match_operand:DI 1 "di_operand" "r,rDa,Db,Dc,mi,mi,r,r,w,w,Uvi,w"))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune == cortexa8
-+ "TARGET_32BIT && TARGET_HARD_FLOAT && arm_tune == cortexa8
- && ( register_operand (operands[0], DImode)
- || register_operand (operands[1], DImode))
- && !(TARGET_NEON && CONST_INT_P (operands[1])
-@@ -211,10 +405,87 @@
- )
-
- ;; HFmode moves
-+
-+(define_insn "*movhf_vfp_fp16"
-+ [(set (match_operand:HF 0 "nonimmediate_operand"
-+ "= r,m,t,r,t,r,t,t,Um,r")
-+ (match_operand:HF 1 "general_operand"
-+ " m,r,t,r,r,t,Dv,Um,t,F"))]
-+ "TARGET_32BIT
-+ && TARGET_VFP_FP16INST
-+ && (s_register_operand (operands[0], HFmode)
-+ || s_register_operand (operands[1], HFmode))"
-+ {
-+ switch (which_alternative)
-+ {
-+ case 0: /* ARM register from memory. */
-+ return \"ldrh%?\\t%0, %1\\t%@ __fp16\";
-+ case 1: /* Memory from ARM register. */
-+ return \"strh%?\\t%1, %0\\t%@ __fp16\";
-+ case 2: /* S register from S register. */
-+ return \"vmov\\t%0, %1\t%@ __fp16\";
-+ case 3: /* ARM register from ARM register. */
-+ return \"mov%?\\t%0, %1\\t%@ __fp16\";
-+ case 4: /* S register from ARM register. */
-+ case 5: /* ARM register from S register. */
-+ case 6: /* S register from immediate. */
-+ return \"vmov.f16\\t%0, %1\t%@ __fp16\";
-+ case 7: /* S register from memory. */
-+ return \"vld1.16\\t{%z0}, %A1\";
-+ case 8: /* Memory from S register. */
-+ return \"vst1.16\\t{%z1}, %A0\";
-+ case 9: /* ARM register from constant. */
-+ {
-+ long bits;
-+ rtx ops[4];
-+
-+ bits = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (operands[1]),
-+ HFmode);
-+ ops[0] = operands[0];
-+ ops[1] = GEN_INT (bits);
-+ ops[2] = GEN_INT (bits & 0xff00);
-+ ops[3] = GEN_INT (bits & 0x00ff);
-+
-+ if (arm_arch_thumb2)
-+ output_asm_insn (\"movw\\t%0, %1\", ops);
-+ else
-+ output_asm_insn (\"mov\\t%0, %2\;orr\\t%0, %0, %3\", ops);
-+ return \"\";
-+ }
-+ default:
-+ gcc_unreachable ();
-+ }
-+ }
-+ [(set_attr "predicable" "yes, yes, no, yes, no, no, no, no, no, no")
-+ (set_attr "predicable_short_it" "no, no, no, yes,\
-+ no, no, no, no,\
-+ no, no")
-+ (set_attr_alternative "type"
-+ [(const_string "load1") (const_string "store1")
-+ (const_string "fmov") (const_string "mov_reg")
-+ (const_string "f_mcr") (const_string "f_mrc")
-+ (const_string "fconsts") (const_string "neon_load1_1reg")
-+ (const_string "neon_store1_1reg")
-+ (if_then_else (match_test "arm_arch_thumb2")
-+ (const_string "mov_imm")
-+ (const_string "multiple"))])
-+ (set_attr_alternative "length"
-+ [(const_int 4) (const_int 4)
-+ (const_int 4) (const_int 4)
-+ (const_int 4) (const_int 4)
-+ (const_int 4) (const_int 4)
-+ (const_int 4)
-+ (if_then_else (match_test "arm_arch_thumb2")
-+ (const_int 4)
-+ (const_int 8))])]
-+)
-+
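# A brief illustration of the *movhf_vfp_fp16 alternatives above
# (illustrative code; __fp16 is ARM's half-precision storage type):
#
#   void
#   copy_half (const __fp16 *src, __fp16 *dst)
#   {
#     __fp16 t = *src;   /* ldrh or vld1.16, depending on allocation  */
#     *dst = t;          /* strh or vst1.16  */
#   }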
- (define_insn "*movhf_vfp_neon"
- [(set (match_operand:HF 0 "nonimmediate_operand" "= t,Um,r,m,t,r,t,r,r")
- (match_operand:HF 1 "general_operand" " Um, t,m,r,t,r,r,t,F"))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON_FP16
-+ "TARGET_32BIT
-+ && TARGET_HARD_FLOAT && TARGET_NEON_FP16
-+ && !TARGET_VFP_FP16INST
- && ( s_register_operand (operands[0], HFmode)
- || s_register_operand (operands[1], HFmode))"
- "*
-@@ -268,7 +539,10 @@
- (define_insn "*movhf_vfp"
- [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,t,r,t,r,r")
- (match_operand:HF 1 "general_operand" " m,r,t,r,r,t,F"))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16 && !TARGET_NEON_FP16
-+ "TARGET_32BIT
-+ && TARGET_HARD_FLOAT
-+ && !TARGET_NEON_FP16
-+ && !TARGET_VFP_FP16INST
- && ( s_register_operand (operands[0], HFmode)
- || s_register_operand (operands[1], HFmode))"
- "*
-@@ -321,7 +595,7 @@
- (define_insn "*movsf_vfp"
- [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t ,t ,Uv,r ,m,t,r")
- (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))]
-- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
-+ "TARGET_ARM && TARGET_HARD_FLOAT
- && ( s_register_operand (operands[0], SFmode)
- || s_register_operand (operands[1], SFmode))"
- "*
-@@ -357,7 +631,7 @@
- (define_insn "*thumb2_movsf_vfp"
- [(set (match_operand:SF 0 "nonimmediate_operand" "=t,?r,t, t ,Uv,r ,m,t,r")
- (match_operand:SF 1 "general_operand" " ?r,t,Dv,UvE,t, mE,r,t,r"))]
-- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
-+ "TARGET_THUMB2 && TARGET_HARD_FLOAT
- && ( s_register_operand (operands[0], SFmode)
- || s_register_operand (operands[1], SFmode))"
- "*
-@@ -394,9 +668,9 @@
- ;; DFmode moves
-
- (define_insn "*movdf_vfp"
-- [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w ,Uv,r, m,w,r")
-- (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,UvF,w ,mF,r,w,r"))]
-- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
-+ [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w ,Uv,r, m,w,r")
-+ (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,G,UvF,w ,mF,r,w,r"))]
-+ "TARGET_ARM && TARGET_HARD_FLOAT
- && ( register_operand (operands[0], DFmode)
- || register_operand (operands[1], DFmode))"
- "*
-@@ -410,40 +684,44 @@
- case 2:
- gcc_assert (TARGET_VFP_DOUBLE);
- return \"vmov%?.f64\\t%P0, %1\";
-- case 3: case 4:
-+ case 3:
-+ gcc_assert (TARGET_VFP_DOUBLE);
-+ return \"vmov.i64\\t%P0, #0\\t%@ float\";
-+ case 4: case 5:
- return output_move_vfp (operands);
-- case 5: case 6:
-+ case 6: case 7:
- return output_move_double (operands, true, NULL);
-- case 7:
-+ case 8:
- if (TARGET_VFP_SINGLE)
- return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\";
- else
- return \"vmov%?.f64\\t%P0, %P1\";
-- case 8:
-+ case 9:
- return \"#\";
- default:
- gcc_unreachable ();
- }
- }
- "
-- [(set_attr "type" "f_mcrr,f_mrrc,fconstd,f_loadd,f_stored,\
-+ [(set_attr "type" "f_mcrr,f_mrrc,fconstd,neon_move,f_loadd,f_stored,\
- load2,store2,ffarithd,multiple")
-- (set (attr "length") (cond [(eq_attr "alternative" "5,6,8") (const_int 8)
-- (eq_attr "alternative" "7")
-+ (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8)
-+ (eq_attr "alternative" "8")
- (if_then_else
- (match_test "TARGET_VFP_SINGLE")
- (const_int 8)
- (const_int 4))]
- (const_int 4)))
-- (set_attr "predicable" "yes")
-- (set_attr "pool_range" "*,*,*,1020,*,1020,*,*,*")
-- (set_attr "neg_pool_range" "*,*,*,1004,*,1004,*,*,*")]
-+ (set_attr "predicable" "yes,yes,yes,no,yes,yes,yes,yes,yes,yes")
-+ (set_attr "pool_range" "*,*,*,*,1020,*,1020,*,*,*")
-+ (set_attr "neg_pool_range" "*,*,*,*,1004,*,1004,*,*,*")
-+ (set_attr "arch" "any,any,any,neon,any,any,any,any,any,any")]
- )
-
- (define_insn "*thumb2_movdf_vfp"
-- [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w ,Uv,r ,m,w,r")
-- (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,UvF,w, mF,r, w,r"))]
-- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP
-+ [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,?r,w ,w,w ,Uv,r ,m,w,r")
-+ (match_operand:DF 1 "soft_df_operand" " ?r,w,Dy,G,UvF,w, mF,r, w,r"))]
-+ "TARGET_THUMB2 && TARGET_HARD_FLOAT
- && ( register_operand (operands[0], DFmode)
- || register_operand (operands[1], DFmode))"
- "*
-@@ -457,11 +735,14 @@
- case 2:
- gcc_assert (TARGET_VFP_DOUBLE);
- return \"vmov%?.f64\\t%P0, %1\";
-- case 3: case 4:
-+ case 3:
-+ gcc_assert (TARGET_VFP_DOUBLE);
-+ return \"vmov.i64\\t%P0, #0\\t%@ float\";
-+ case 4: case 5:
- return output_move_vfp (operands);
-- case 5: case 6: case 8:
-+ case 6: case 7: case 9:
- return output_move_double (operands, true, NULL);
-- case 7:
-+ case 8:
- if (TARGET_VFP_SINGLE)
- return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\";
- else
-@@ -471,17 +752,18 @@
- }
- }
- "
-- [(set_attr "type" "f_mcrr,f_mrrc,fconstd,f_loadd,\
-+ [(set_attr "type" "f_mcrr,f_mrrc,fconstd,neon_move,f_loadd,\
- f_stored,load2,store2,ffarithd,multiple")
-- (set (attr "length") (cond [(eq_attr "alternative" "5,6,8") (const_int 8)
-- (eq_attr "alternative" "7")
-+ (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8)
-+ (eq_attr "alternative" "8")
- (if_then_else
- (match_test "TARGET_VFP_SINGLE")
- (const_int 8)
- (const_int 4))]
- (const_int 4)))
-- (set_attr "pool_range" "*,*,*,1018,*,4094,*,*,*")
-- (set_attr "neg_pool_range" "*,*,*,1008,*,0,*,*,*")]
-+ (set_attr "pool_range" "*,*,*,*,1018,*,4094,*,*,*")
-+ (set_attr "neg_pool_range" "*,*,*,*,1008,*,0,*,*,*")
-+ (set_attr "arch" "any,any,any,neon,any,any,any,any,any,any")]
- )
-
-
-@@ -494,7 +776,7 @@
- [(match_operand 4 "cc_register" "") (const_int 0)])
- (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t")
- (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))]
-- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_ARM && TARGET_HARD_FLOAT"
- "@
- vmov%D3.f32\\t%0, %2
- vmov%d3.f32\\t%0, %1
-@@ -517,7 +799,7 @@
- [(match_operand 4 "cc_register" "") (const_int 0)])
- (match_operand:SF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t")
- (match_operand:SF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))]
-- "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP && !arm_restrict_it"
-+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && !arm_restrict_it"
- "@
- it\\t%D3\;vmov%D3.f32\\t%0, %2
- it\\t%d3\;vmov%d3.f32\\t%0, %1
-@@ -585,7 +867,7 @@
- (define_insn "*abssf2_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (abs:SF (match_operand:SF 1 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vabs%?.f32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -605,7 +887,7 @@
- (define_insn "*negsf2_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t,?r")
- (neg:SF (match_operand:SF 1 "s_register_operand" "t,r")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "@
- vneg%?.f32\\t%0, %1
- eor%?\\t%0, %1, #-2147483648"
-@@ -661,14 +943,68 @@
- (set_attr "type" "ffarithd")]
- )
-
-+;; ABS and NEG for FP16.
-+(define_insn "<absneg_str>hf2"
-+ [(set (match_operand:HF 0 "s_register_operand" "=w")
-+ (ABSNEG:HF (match_operand:HF 1 "s_register_operand" "w")))]
-+ "TARGET_VFP_FP16INST"
-+ "v<absneg_str>.f16\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "ffariths")]
-+)
-+
-+(define_expand "neon_vabshf"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand")
-+ (abs:HF (match_operand:HF 1 "s_register_operand")))]
-+ "TARGET_VFP_FP16INST"
-+{
-+ emit_insn (gen_abshf2 (operands[0], operands[1]));
-+ DONE;
-+})
-+
-+;; VRND for FP16.
-+(define_insn "neon_v<fp16_rnd_str>hf"
-+ [(set (match_operand:HF 0 "s_register_operand" "=w")
-+ (unspec:HF
-+ [(match_operand:HF 1 "s_register_operand" "w")]
-+ FP16_RND))]
-+ "TARGET_VFP_FP16INST"
-+ "<fp16_rnd_insn>.f16\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "neon_fp_round_s")]
-+)
-+
-+(define_insn "neon_vrndihf"
-+ [(set (match_operand:HF 0 "s_register_operand" "=w")
-+ (unspec:HF
-+ [(match_operand:HF 1 "s_register_operand" "w")]
-+ UNSPEC_VRNDI))]
-+ "TARGET_VFP_FP16INST"
-+ "vrintr.f16\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "neon_fp_round_s")]
-+)
-
- ;; Arithmetic insns
-
-+(define_insn "addhf3"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand" "=w")
-+ (plus:HF
-+ (match_operand:HF 1 "s_register_operand" "w")
-+ (match_operand:HF 2 "s_register_operand" "w")))]
-+ "TARGET_VFP_FP16INST"
-+ "vadd.f16\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fadds")]
-+)
-+
- (define_insn "*addsf3_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (plus:SF (match_operand:SF 1 "s_register_operand" "t")
- (match_operand:SF 2 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vadd%?.f32\\t%0, %1, %2"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -686,12 +1022,23 @@
- (set_attr "type" "faddd")]
- )
-
-+(define_insn "subhf3"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand" "=w")
-+ (minus:HF
-+ (match_operand:HF 1 "s_register_operand" "w")
-+ (match_operand:HF 2 "s_register_operand" "w")))]
-+ "TARGET_VFP_FP16INST"
-+ "vsub.f16\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fadds")]
-+)
-
- (define_insn "*subsf3_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (minus:SF (match_operand:SF 1 "s_register_operand" "t")
- (match_operand:SF 2 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vsub%?.f32\\t%0, %1, %2"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -712,6 +1059,19 @@
-
- ;; Division insns
-
-+;; FP16 Division.
-+(define_insn "divhf3"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand" "=w")
-+ (div:HF
-+ (match_operand:HF 1 "s_register_operand" "w")
-+ (match_operand:HF 2 "s_register_operand" "w")))]
-+ "TARGET_VFP_FP16INST"
-+ "vdiv.f16\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fdivs")]
-+)
-+
- ; VFP9 Erratum 760019: It's potentially unsafe to overwrite the input
- ; operands, so mark the output as early clobber for VFPv2 on ARMv5 or
- ; earlier.
-@@ -719,7 +1079,7 @@
- [(set (match_operand:SF 0 "s_register_operand" "=&t,t")
- (div:SF (match_operand:SF 1 "s_register_operand" "t,t")
- (match_operand:SF 2 "s_register_operand" "t,t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vdiv%?.f32\\t%0, %1, %2"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -742,11 +1102,22 @@
-
- ;; Multiplication insns
-
-+(define_insn "mulhf3"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand" "=w")
-+ (mult:HF (match_operand:HF 1 "s_register_operand" "w")
-+ (match_operand:HF 2 "s_register_operand" "w")))]
-+ "TARGET_VFP_FP16INST"
-+ "vmul.f16\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmuls")]
-+)
-+
- (define_insn "*mulsf3_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (mult:SF (match_operand:SF 1 "s_register_operand" "t")
- (match_operand:SF 2 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vmul%?.f32\\t%0, %1, %2"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -764,11 +1135,31 @@
- (set_attr "type" "fmuld")]
- )
-
-+(define_insn "*mulsf3neghf_vfp"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (mult:HF (neg:HF (match_operand:HF 1 "s_register_operand" "t"))
-+ (match_operand:HF 2 "s_register_operand" "t")))]
-+ "TARGET_VFP_FP16INST && !flag_rounding_math"
-+ "vnmul.f16\\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmuls")]
-+)
-+
-+(define_insn "*negmulhf3_vfp"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (neg:HF (mult:HF (match_operand:HF 1 "s_register_operand" "t")
-+ (match_operand:HF 2 "s_register_operand" "t"))))]
-+ "TARGET_VFP_FP16INST"
-+ "vnmul.f16\\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmuls")]
-+)
-+
- (define_insn "*mulsf3negsf_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "t"))
- (match_operand:SF 2 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP && !flag_rounding_math"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT && !flag_rounding_math"
- "vnmul%?.f32\\t%0, %1, %2"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -779,7 +1170,7 @@
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (neg:SF (mult:SF (match_operand:SF 1 "s_register_operand" "t")
- (match_operand:SF 2 "s_register_operand" "t"))))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vnmul%?.f32\\t%0, %1, %2"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -813,12 +1204,24 @@
- ;; Multiply-accumulate insns
-
- ;; 0 = 1 * 2 + 0
-+(define_insn "*mulsf3addhf_vfp"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (plus:HF
-+ (mult:HF (match_operand:HF 2 "s_register_operand" "t")
-+ (match_operand:HF 3 "s_register_operand" "t"))
-+ (match_operand:HF 1 "s_register_operand" "0")))]
-+ "TARGET_VFP_FP16INST"
-+ "vmla.f16\\t%0, %2, %3"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmacs")]
-+)
-+
- (define_insn "*mulsf3addsf_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t")
- (match_operand:SF 3 "s_register_operand" "t"))
- (match_operand:SF 1 "s_register_operand" "0")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vmla%?.f32\\t%0, %2, %3"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -838,12 +1241,23 @@
- )
-
- ;; 0 = 1 * 2 - 0
-+(define_insn "*mulhf3subhf_vfp"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (minus:HF (mult:HF (match_operand:HF 2 "s_register_operand" "t")
-+ (match_operand:HF 3 "s_register_operand" "t"))
-+ (match_operand:HF 1 "s_register_operand" "0")))]
-+ "TARGET_VFP_FP16INST"
-+ "vnmls.f16\\t%0, %2, %3"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmacs")]
-+)
-+
- (define_insn "*mulsf3subsf_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (minus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t")
- (match_operand:SF 3 "s_register_operand" "t"))
- (match_operand:SF 1 "s_register_operand" "0")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vnmls%?.f32\\t%0, %2, %3"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -863,12 +1277,23 @@
- )
-
- ;; 0 = -(1 * 2) + 0
-+(define_insn "*mulhf3neghfaddhf_vfp"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (minus:HF (match_operand:HF 1 "s_register_operand" "0")
-+ (mult:HF (match_operand:HF 2 "s_register_operand" "t")
-+ (match_operand:HF 3 "s_register_operand" "t"))))]
-+ "TARGET_VFP_FP16INST"
-+ "vmls.f16\\t%0, %2, %3"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmacs")]
-+)
-+
- (define_insn "*mulsf3negsfaddsf_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (minus:SF (match_operand:SF 1 "s_register_operand" "0")
- (mult:SF (match_operand:SF 2 "s_register_operand" "t")
- (match_operand:SF 3 "s_register_operand" "t"))))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vmls%?.f32\\t%0, %2, %3"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -889,13 +1314,25 @@
-
-
- ;; 0 = -(1 * 2) - 0
-+(define_insn "*mulhf3neghfsubhf_vfp"
-+ [(set (match_operand:HF 0 "s_register_operand" "=t")
-+ (minus:HF (mult:HF
-+ (neg:HF (match_operand:HF 2 "s_register_operand" "t"))
-+ (match_operand:HF 3 "s_register_operand" "t"))
-+ (match_operand:HF 1 "s_register_operand" "0")))]
-+ "TARGET_VFP_FP16INST"
-+ "vnmla.f16\\t%0, %2, %3"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fmacs")]
-+)
-+
- (define_insn "*mulsf3negsfsubsf_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (minus:SF (mult:SF
- (neg:SF (match_operand:SF 2 "s_register_operand" "t"))
- (match_operand:SF 3 "s_register_operand" "t"))
- (match_operand:SF 1 "s_register_operand" "0")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vnmla%?.f32\\t%0, %2, %3"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -917,6 +1354,30 @@
-
- ;; Fused-multiply-accumulate
-
-+(define_insn "fmahf4"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (fma:HF
-+ (match_operand:HF 1 "register_operand" "w")
-+ (match_operand:HF 2 "register_operand" "w")
-+ (match_operand:HF 3 "register_operand" "0")))]
-+ "TARGET_VFP_FP16INST"
-+ "vfma.f16\\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "ffmas")]
-+)
-+
-+(define_expand "neon_vfmahf"
-+ [(match_operand:HF 0 "s_register_operand")
-+ (match_operand:HF 1 "s_register_operand")
-+ (match_operand:HF 2 "s_register_operand")
-+ (match_operand:HF 3 "s_register_operand")]
-+ "TARGET_VFP_FP16INST"
-+{
-+ emit_insn (gen_fmahf4 (operands[0], operands[2], operands[3],
-+ operands[1]));
-+ DONE;
-+})
-+
- (define_insn "fma<SDF:mode>4"
- [(set (match_operand:SDF 0 "register_operand" "=<F_constraint>")
- (fma:SDF (match_operand:SDF 1 "register_operand" "<F_constraint>")
-@@ -929,6 +1390,30 @@
- (set_attr "type" "ffma<vfp_type>")]
- )
-
-+(define_insn "fmsubhf4_fp16"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (fma:HF
-+ (neg:HF (match_operand:HF 1 "register_operand" "w"))
-+ (match_operand:HF 2 "register_operand" "w")
-+ (match_operand:HF 3 "register_operand" "0")))]
-+ "TARGET_VFP_FP16INST"
-+ "vfms.f16\\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "ffmas")]
-+)
-+
-+(define_expand "neon_vfmshf"
-+ [(match_operand:HF 0 "s_register_operand")
-+ (match_operand:HF 1 "s_register_operand")
-+ (match_operand:HF 2 "s_register_operand")
-+ (match_operand:HF 3 "s_register_operand")]
-+ "TARGET_VFP_FP16INST"
-+{
-+ emit_insn (gen_fmsubhf4_fp16 (operands[0], operands[2], operands[3],
-+ operands[1]));
-+ DONE;
-+})
-+
- (define_insn "*fmsub<SDF:mode>4"
- [(set (match_operand:SDF 0 "register_operand" "=<F_constraint>")
- (fma:SDF (neg:SDF (match_operand:SDF 1 "register_operand"
-@@ -942,6 +1427,17 @@
- (set_attr "type" "ffma<vfp_type>")]
- )
-
-+(define_insn "*fnmsubhf4"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (fma:HF (match_operand:HF 1 "register_operand" "w")
-+ (match_operand:HF 2 "register_operand" "w")
-+ (neg:HF (match_operand:HF 3 "register_operand" "0"))))]
-+ "TARGET_VFP_FP16INST"
-+ "vfnms.f16\\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "ffmas")]
-+)
-+
- (define_insn "*fnmsub<SDF:mode>4"
- [(set (match_operand:SDF 0 "register_operand" "=<F_constraint>")
- (fma:SDF (match_operand:SDF 1 "register_operand" "<F_constraint>")
-@@ -954,6 +1450,17 @@
- (set_attr "type" "ffma<vfp_type>")]
- )
-
-+(define_insn "*fnmaddhf4"
-+ [(set (match_operand:HF 0 "register_operand" "=w")
-+ (fma:HF (neg:HF (match_operand:HF 1 "register_operand" "w"))
-+ (match_operand:HF 2 "register_operand" "w")
-+ (neg:HF (match_operand:HF 3 "register_operand" "0"))))]
-+ "TARGET_VFP_FP16INST"
-+ "vfnma.f16\\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "ffmas")]
-+)
-+
- (define_insn "*fnmadd<SDF:mode>4"
- [(set (match_operand:SDF 0 "register_operand" "=<F_constraint>")
- (fma:SDF (neg:SDF (match_operand:SDF 1 "register_operand"
-@@ -993,7 +1500,7 @@
- (define_insn "extendhfsf2"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (float_extend:SF (match_operand:HF 1 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FP16 || TARGET_VFP_FP16INST)"
- "vcvtb%?.f32.f16\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1003,7 +1510,7 @@
- (define_insn "truncsfhf2"
- [(set (match_operand:HF 0 "s_register_operand" "=t")
- (float_truncate:HF (match_operand:SF 1 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FP16 || TARGET_VFP_FP16INST)"
- "vcvtb%?.f16.f32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1013,7 +1520,7 @@
- (define_insn "*truncsisf2_vfp"
- [(set (match_operand:SI 0 "s_register_operand" "=t")
- (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vcvt%?.s32.f32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1034,7 +1541,7 @@
- (define_insn "fixuns_truncsfsi2"
- [(set (match_operand:SI 0 "s_register_operand" "=t")
- (unsigned_fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "t"))))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vcvt%?.u32.f32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1055,7 +1562,7 @@
- (define_insn "*floatsisf2_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (float:SF (match_operand:SI 1 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vcvt%?.f32.s32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1076,7 +1583,7 @@
- (define_insn "floatunssisf2"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (unsigned_float:SF (match_operand:SI 1 "s_register_operand" "t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vcvt%?.f32.u32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1096,13 +1603,34 @@
-
- ;; Sqrt insns.
-
-+(define_insn "neon_vsqrthf"
-+ [(set (match_operand:HF 0 "s_register_operand" "=w")
-+ (sqrt:HF (match_operand:HF 1 "s_register_operand" "w")))]
-+ "TARGET_VFP_FP16INST"
-+ "vsqrt.f16\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fsqrts")]
-+)
-+
-+(define_insn "neon_vrsqrtshf"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand" "=w")
-+ (unspec:HF [(match_operand:HF 1 "s_register_operand" "w")
-+ (match_operand:HF 2 "s_register_operand" "w")]
-+ UNSPEC_VRSQRTS))]
-+ "TARGET_VFP_FP16INST"
-+ "vrsqrts.f16\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "fsqrts")]
-+)
-+
- ; VFP9 Erratum 760019: It's potentially unsafe to overwrite the input
- ; operands, so mark the output as early clobber for VFPv2 on ARMv5 or
- ; earlier.
- (define_insn "*sqrtsf2_vfp"
- [(set (match_operand:SF 0 "s_register_operand" "=&t,t")
- (sqrt:SF (match_operand:SF 1 "s_register_operand" "t,t")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vsqrt%?.f32\\t%0, %1"
- [(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no")
-@@ -1127,7 +1655,7 @@
- (define_insn "*movcc_vfp"
- [(set (reg CC_REGNUM)
- (reg VFPCC_REGNUM))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "vmrs%?\\tAPSR_nzcv, FPSCR"
- [(set_attr "conds" "set")
- (set_attr "type" "f_flag")]
-@@ -1137,9 +1665,9 @@
- [(set (reg:CCFP CC_REGNUM)
- (compare:CCFP (match_operand:SF 0 "s_register_operand" "t")
- (match_operand:SF 1 "vfp_compare_operand" "tG")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "#"
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- [(set (reg:CCFP VFPCC_REGNUM)
- (compare:CCFP (match_dup 0)
- (match_dup 1)))
-@@ -1152,9 +1680,9 @@
- [(set (reg:CCFPE CC_REGNUM)
- (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t")
- (match_operand:SF 1 "vfp_compare_operand" "tG")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "#"
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- [(set (reg:CCFPE VFPCC_REGNUM)
- (compare:CCFPE (match_dup 0)
- (match_dup 1)))
-@@ -1203,7 +1731,7 @@
- [(set (reg:CCFP VFPCC_REGNUM)
- (compare:CCFP (match_operand:SF 0 "s_register_operand" "t,t")
- (match_operand:SF 1 "vfp_compare_operand" "t,G")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "@
- vcmp%?.f32\\t%0, %1
- vcmp%?.f32\\t%0, #0"
-@@ -1216,7 +1744,7 @@
- [(set (reg:CCFPE VFPCC_REGNUM)
- (compare:CCFPE (match_operand:SF 0 "s_register_operand" "t,t")
- (match_operand:SF 1 "vfp_compare_operand" "t,G")))]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "@
- vcmpe%?.f32\\t%0, %1
- vcmpe%?.f32\\t%0, #0"
-@@ -1252,9 +1780,6 @@
- )
-
- ;; Fixed point to floating point conversions.
--(define_code_iterator FCVT [unsigned_float float])
--(define_code_attr FCVTI32typename [(unsigned_float "u32") (float "s32")])
--
- (define_insn "*combine_vcvt_f32_<FCVTI32typename>"
- [(set (match_operand:SF 0 "s_register_operand" "=t")
- (mult:SF (FCVT:SF (match_operand:SI 1 "s_register_operand" "0"))
-@@ -1299,13 +1824,132 @@
- (set_attr "type" "f_cvtf2i")]
- )
-
-+;; FP16 conversions.
-+(define_insn "neon_vcvth<sup>hf"
-+ [(set (match_operand:HF 0 "s_register_operand" "=w")
-+ (unspec:HF
-+ [(match_operand:SI 1 "s_register_operand" "w")]
-+ VCVTH_US))]
-+ "TARGET_VFP_FP16INST"
-+ "vcvt.f16.<sup>%#32\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "f_cvti2f")]
-+)
-+
-+(define_insn "neon_vcvth<sup>si"
-+ [(set (match_operand:SI 0 "s_register_operand" "=w")
-+ (unspec:SI
-+ [(match_operand:HF 1 "s_register_operand" "w")]
-+ VCVTH_US))]
-+ "TARGET_VFP_FP16INST"
-+ "vcvt.<sup>%#32.f16\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "f_cvtf2i")]
-+)
-+
-+;; The neon_vcvth<sup>_nhf patterns are used to generate the instruction for the
-+;; vcvth_n_f16_<sup>32 arm_fp16 intrinsics. They are complicated by the
-+;; hardware requirement that the source and destination registers are the same
-+;; despite having different machine modes. The approach is to use a temporary
-+;; register for the conversion and move that to the correct destination.
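-+;; For example (register choice illustrative only): for a signed
-+;; conversion with a shift of 16, with s0 as the SImode temporary and
-+;; s1 as the HFmode destination, the unspec below would emit:
-+;;   vcvt.f16.s32  s0, s0, #16
-+;;   vmov.f32      s1, s0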
-+
-+;; Generate an unspec pattern for the intrinsic.
-+(define_insn "neon_vcvth<sup>_nhf_unspec"
-+ [(set
-+ (match_operand:SI 0 "s_register_operand" "=w")
-+ (unspec:SI
-+ [(match_operand:SI 1 "s_register_operand" "0")
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ VCVT_HF_US_N))
-+ (set
-+ (match_operand:HF 3 "s_register_operand" "=w")
-+ (float_truncate:HF (float:SF (match_dup 0))))]
-+ "TARGET_VFP_FP16INST"
-+{
-+ neon_const_bounds (operands[2], 1, 33);
-+ return "vcvt.f16.<sup>32\t%0, %0, %2\;vmov.f32\t%3, %0";
-+}
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "f_cvti2f")]
-+)
-+
-+;; Generate the instruction patterns needed for vcvth_n_f16_s32 neon intrinsics.
-+(define_expand "neon_vcvth<sup>_nhf"
-+ [(match_operand:HF 0 "s_register_operand")
-+ (unspec:HF [(match_operand:SI 1 "s_register_operand")
-+ (match_operand:SI 2 "immediate_operand")]
-+ VCVT_HF_US_N)]
-+"TARGET_VFP_FP16INST"
-+{
-+ rtx op1 = gen_reg_rtx (SImode);
-+
-+ neon_const_bounds (operands[2], 1, 33);
-+
-+ emit_move_insn (op1, operands[1]);
-+ emit_insn (gen_neon_vcvth<sup>_nhf_unspec (op1, op1, operands[2],
-+ operands[0]));
-+ DONE;
-+})
-+
-+;; The neon_vcvth<sup>_nsi patterns are used to generate the instruction for the
-+;; vcvth_n_<sup>32_f16 arm_fp16 intrinsics. They have the same restrictions and
-+;; are implemented in the same way as the neon_vcvth<sup>_nhf patterns.
-+
-+;; Generate an unspec pattern, constraining the registers.
-+(define_insn "neon_vcvth<sup>_nsi_unspec"
-+ [(set (match_operand:SI 0 "s_register_operand" "=w")
-+ (unspec:SI
-+ [(fix:SI
-+ (fix:SF
-+ (float_extend:SF
-+ (match_operand:HF 1 "s_register_operand" "w"))))
-+ (match_operand:SI 2 "immediate_operand" "i")]
-+ VCVT_SI_US_N))]
-+ "TARGET_VFP_FP16INST"
-+{
-+ neon_const_bounds (operands[2], 1, 33);
-+ return "vmov.f32\t%0, %1\;vcvt.<sup>%#32.f16\t%0, %0, %2";
-+}
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "f_cvtf2i")]
-+)
-+
-+;; Generate the instruction patterns needed for vcvth_n_<sup>32_f16 neon intrinsics.
-+(define_expand "neon_vcvth<sup>_nsi"
-+ [(match_operand:SI 0 "s_register_operand")
-+ (unspec:SI
-+ [(match_operand:HF 1 "s_register_operand")
-+ (match_operand:SI 2 "immediate_operand")]
-+ VCVT_SI_US_N)]
-+ "TARGET_VFP_FP16INST"
-+{
-+ rtx op1 = gen_reg_rtx (SImode);
-+
-+ neon_const_bounds (operands[2], 1, 33);
-+ emit_insn (gen_neon_vcvth<sup>_nsi_unspec (op1, operands[1], operands[2]));
-+ emit_move_insn (operands[0], op1);
-+ DONE;
-+})
-+
-+(define_insn "neon_vcvt<vcvth_op>h<sup>si"
-+ [(set
-+ (match_operand:SI 0 "s_register_operand" "=w")
-+ (unspec:SI
-+ [(match_operand:HF 1 "s_register_operand" "w")]
-+ VCVT_HF_US))]
-+ "TARGET_VFP_FP16INST"
-+ "vcvt<vcvth_op>.<sup>%#32.f16\t%0, %1"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "f_cvtf2i")]
-+)
-+
- ;; Store multiple insn used in function prologue.
- (define_insn "*push_multi_vfp"
- [(match_parallel 2 "multi_register_push"
- [(set (match_operand:BLK 0 "memory_operand" "=m")
- (unspec:BLK [(match_operand:DF 1 "vfp_register_operand" "")]
- UNSPEC_PUSH_MULT))])]
-- "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
-+ "TARGET_32BIT && TARGET_HARD_FLOAT"
- "* return vfp_output_vstmd (operands);"
- [(set_attr "type" "f_stored")]
- )
-@@ -1368,6 +2012,20 @@
- )
-
- ;; Scalar forms for the IEEE-754 fmax()/fmin() functions
-+
-+(define_insn "neon_<fmaxmin_op>hf"
-+ [(set
-+ (match_operand:HF 0 "s_register_operand" "=w")
-+ (unspec:HF
-+ [(match_operand:HF 1 "s_register_operand" "w")
-+ (match_operand:HF 2 "s_register_operand" "w")]
-+ VMAXMINFNM))]
-+ "TARGET_VFP_FP16INST"
-+ "<fmaxmin_op>.f16\t%0, %1, %2"
-+ [(set_attr "conds" "unconditional")
-+ (set_attr "type" "f_minmaxs")]
-+)
-+
- (define_insn "<fmaxmin><mode>3"
- [(set (match_operand:SDF 0 "s_register_operand" "=<F_constraint>")
- (unspec:SDF [(match_operand:SDF 1 "s_register_operand" "<F_constraint>")
-@@ -1382,7 +2040,7 @@
- ;; Write Floating-point Status and Control Register.
- (define_insn "set_fpscr"
- [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")] VUNSPEC_SET_FPSCR)]
-- "TARGET_VFP && TARGET_HARD_FLOAT"
-+ "TARGET_HARD_FLOAT"
- "mcr\\tp10, 7, %0, cr1, cr0, 0\\t @SET_FPSCR"
- [(set_attr "type" "mrs")])
-
-@@ -1390,7 +2048,7 @@
- (define_insn "get_fpscr"
- [(set (match_operand:SI 0 "register_operand" "=r")
- (unspec_volatile:SI [(const_int 0)] VUNSPEC_GET_FPSCR))]
-- "TARGET_VFP && TARGET_HARD_FLOAT"
-+ "TARGET_HARD_FLOAT"
- "mrc\\tp10, 7, %0, cr1, cr0, 0\\t @GET_FPSCR"
- [(set_attr "type" "mrs")])
-
---- a/src/gcc/config/arm/xgene1.md
-+++ b/src/gcc/config/arm/xgene1.md
-@@ -164,7 +164,7 @@
-
- (define_insn_reservation "xgene1_bfm" 2
- (and (eq_attr "tune" "xgene1")
-- (eq_attr "type" "bfm"))
-+ (eq_attr "type" "bfm,bfx"))
- "xgene1_decode1op,xgene1_fsu")
-
- (define_insn_reservation "xgene1_f_rint" 5
---- a/src/gcc/config/i386/i386.c
-+++ b/src/gcc/config/i386/i386.c
-@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see
- #include "backend.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "cfghooks.h"
- #include "cfgloop.h"
---- a/src/gcc/config/ia64/ia64.c
-+++ b/src/gcc/config/ia64/ia64.c
-@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "cfghooks.h"
- #include "df.h"
- #include "tm_p.h"
---- a/src/gcc/config/linux.c
-+++ b/src/gcc/config/linux.c
-@@ -26,7 +26,7 @@ along with GCC; see the file COPYING3. If not see
- bool
- linux_libc_has_function (enum function_class fn_class)
- {
-- if (OPTION_GLIBC)
-+ if (OPTION_GLIBC || OPTION_MUSL)
- return true;
- if (OPTION_BIONIC)
- if (fn_class == function_c94
---- a/src/gcc/config/mips/mips.c
-+++ b/src/gcc/config/mips/mips.c
-@@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "cfghooks.h"
- #include "df.h"
---- a/src/gcc/config/rs6000/rs6000.c
-+++ b/src/gcc/config/rs6000/rs6000.c
-@@ -24,6 +24,7 @@
- #include "backend.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "cfghooks.h"
- #include "cfgloop.h"
---- a/src/gcc/config/sparc/sparc.c
-+++ b/src/gcc/config/sparc/sparc.c
-@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "df.h"
- #include "tm_p.h"
---- a/src/gcc/configure
-+++ b/src/gcc/configure
-@@ -1711,7 +1711,8 @@ Optional Packages:
- --with-stabs arrange to use stabs instead of host debug format
- --with-dwarf2 force the default debug format to be DWARF 2
- --with-specs=SPECS add SPECS to driver command-line processing
-- --with-pkgversion=PKG Use PKG in the version string in place of "GCC"
-+ --with-pkgversion=PKG Use PKG in the version string in place of "Linaro
-+ GCC `cat $srcdir/LINARO-VERSION`"
- --with-bugurl=URL Direct users to URL to report a bug
- --with-multilib-list select multilibs (AArch64, SH and x86-64 only)
- --with-gnu-ld assume the C compiler uses GNU ld default=no
-@@ -7658,7 +7659,7 @@ if test "${with_pkgversion+set}" = set; then :
- *) PKGVERSION="($withval) " ;;
- esac
- else
-- PKGVERSION="(GCC) "
-+ PKGVERSION="(Linaro GCC `cat $srcdir/LINARO-VERSION`) "
-
- fi
-
-@@ -18460,7 +18461,7 @@ else
- lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
- lt_status=$lt_dlunknown
- cat > conftest.$ac_ext <<_LT_EOF
--#line 18463 "configure"
-+#line 18464 "configure"
- #include "confdefs.h"
-
- #if HAVE_DLFCN_H
-@@ -18566,7 +18567,7 @@ else
- lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
- lt_status=$lt_dlunknown
- cat > conftest.$ac_ext <<_LT_EOF
--#line 18569 "configure"
-+#line 18570 "configure"
- #include "confdefs.h"
-
- #if HAVE_DLFCN_H
---- a/src/gcc/configure.ac
-+++ b/src/gcc/configure.ac
-@@ -910,7 +910,7 @@ AC_ARG_WITH(specs,
- )
- AC_SUBST(CONFIGURE_SPECS)
-
--ACX_PKGVERSION([GCC])
-+ACX_PKGVERSION([Linaro GCC `cat $srcdir/LINARO-VERSION`])
- ACX_BUGURL([http://gcc.gnu.org/bugs.html])
-
- # Sanity check enable_languages in case someone does not run the toplevel
---- a/src/gcc/cppbuiltin.c
-+++ b/src/gcc/cppbuiltin.c
-@@ -52,18 +52,41 @@ parse_basever (int *major, int *minor, int *patchlevel)
- *patchlevel = s_patchlevel;
- }
-
-+/* Parse a LINAROVER version string of the format "M.m-year.month[-spin][~dev]"
-+ to create Linaro release number YYYYMM and spin version. */
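-+/* For example (hypothetical version string): a LINAROVER of
-+   "8.2-2018.08-1" yields release 201808 and spin 1; without the
-+   trailing "-1" the spin defaults to 0.  */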
-+static void
-+parse_linarover (int *release, int *spin)
-+{
-+ static int s_year = -1, s_month, s_spin;
-+
-+ if (s_year == -1)
-+ if (sscanf (LINAROVER, "%*[^-]-%d.%d-%d", &s_year, &s_month, &s_spin) != 3)
-+ {
-+ sscanf (LINAROVER, "%*[^-]-%d.%d", &s_year, &s_month);
-+ s_spin = 0;
-+ }
-+
-+ if (release)
-+ *release = s_year * 100 + s_month;
-+
-+ if (spin)
-+ *spin = s_spin;
-+}
-
- /* Define __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ and __VERSION__. */
- static void
- define__GNUC__ (cpp_reader *pfile)
- {
-- int major, minor, patchlevel;
-+ int major, minor, patchlevel, linaro_release, linaro_spin;
-
- parse_basever (&major, &minor, &patchlevel);
-+ parse_linarover (&linaro_release, &linaro_spin);
- cpp_define_formatted (pfile, "__GNUC__=%d", major);
- cpp_define_formatted (pfile, "__GNUC_MINOR__=%d", minor);
- cpp_define_formatted (pfile, "__GNUC_PATCHLEVEL__=%d", patchlevel);
- cpp_define_formatted (pfile, "__VERSION__=\"%s\"", version_string);
-+ cpp_define_formatted (pfile, "__LINARO_RELEASE__=%d", linaro_release);
-+ cpp_define_formatted (pfile, "__LINARO_SPIN__=%d", linaro_spin);
- cpp_define_formatted (pfile, "__ATOMIC_RELAXED=%d", MEMMODEL_RELAXED);
- cpp_define_formatted (pfile, "__ATOMIC_SEQ_CST=%d", MEMMODEL_SEQ_CST);
- cpp_define_formatted (pfile, "__ATOMIC_ACQUIRE=%d", MEMMODEL_ACQUIRE);
---- a/src/gcc/defaults.h
-+++ b/src/gcc/defaults.h
-@@ -971,11 +971,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define REG_WORDS_BIG_ENDIAN WORDS_BIG_ENDIAN
- #endif
-
--#ifdef TARGET_FLT_EVAL_METHOD
--#define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 1
--#else
-+#ifndef TARGET_FLT_EVAL_METHOD
- #define TARGET_FLT_EVAL_METHOD 0
--#define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 0
- #endif
-
- #ifndef TARGET_DEC_EVAL_METHOD
---- a/src/gcc/expmed.c
-+++ b/src/gcc/expmed.c
-@@ -2522,16 +2522,8 @@ expand_variable_shift (enum tree_code code, machine_mode mode, rtx shifted,
- }
-
-
--/* Indicates the type of fixup needed after a constant multiplication.
-- BASIC_VARIANT means no fixup is needed, NEGATE_VARIANT means that
-- the result should be negated, and ADD_VARIANT means that the
-- multiplicand should be added to the result. */
--enum mult_variant {basic_variant, negate_variant, add_variant};
--
- static void synth_mult (struct algorithm *, unsigned HOST_WIDE_INT,
- const struct mult_cost *, machine_mode mode);
--static bool choose_mult_variant (machine_mode, HOST_WIDE_INT,
-- struct algorithm *, enum mult_variant *, int);
- static rtx expand_mult_const (machine_mode, rtx, HOST_WIDE_INT, rtx,
- const struct algorithm *, enum mult_variant);
- static unsigned HOST_WIDE_INT invert_mod2n (unsigned HOST_WIDE_INT, int);
-@@ -3021,7 +3013,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
-    Return true if the cheapest of these costs less than MULT_COST,
- describing the algorithm in *ALG and final fixup in *VARIANT. */
-
--static bool
-+bool
- choose_mult_variant (machine_mode mode, HOST_WIDE_INT val,
- struct algorithm *alg, enum mult_variant *variant,
- int mult_cost)
---- a/src/gcc/expmed.h
-+++ b/src/gcc/expmed.h
-@@ -35,6 +35,15 @@ enum alg_code {
- alg_impossible
- };
-
-+/* Indicates the type of fixup needed after a constant multiplication.
-+ BASIC_VARIANT means no fixup is needed, NEGATE_VARIANT means that
-+ the result should be negated, and ADD_VARIANT means that the
-+ multiplicand should be added to the result. */
-+enum mult_variant {basic_variant, negate_variant, add_variant};
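-+
-+/* For example (illustrative): a multiplication by -3 can use
-+   negate_variant (synthesize x * 3, then negate), and one by 9 can use
-+   add_variant (synthesize x * 8 by a shift, then add x).  */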
-+
-+bool choose_mult_variant (machine_mode, HOST_WIDE_INT,
-+ struct algorithm *, enum mult_variant *, int);
-+
- /* This structure holds the "cost" of a multiply sequence. The
- "cost" field holds the total rtx_cost of every operator in the
- synthetic multiplication sequence, hence cost(a op b) is defined
---- a/src/gcc/fold-const.c
-+++ b/src/gcc/fold-const.c
-@@ -7230,7 +7230,16 @@ native_encode_real (const_tree expr, unsigned char *ptr, int len, int off)
- offset += byte % UNITS_PER_WORD;
- }
- else
-- offset = BYTES_BIG_ENDIAN ? 3 - byte : byte;
-+ {
-+ offset = byte;
-+ if (BYTES_BIG_ENDIAN)
-+ {
-+ /* Reverse bytes within each long, or within the entire float
-+ if it's smaller than a long (for HFmode). */
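-+	  /* E.g. for a 2-byte HFmode value this becomes
-+	     offset = 1 - offset, swapping the two bytes.  */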
-+ offset = MIN (3, total_bytes - 1) - offset;
-+ gcc_assert (offset >= 0);
-+ }
-+ }
- offset = offset + ((bitpos / BITS_PER_UNIT) & ~3);
- if (offset >= off
- && offset - off < len)
---- a/src/gcc/fortran/options.c
-+++ b/src/gcc/fortran/options.c
-@@ -208,8 +208,7 @@ gfc_post_options (const char **pfilename)
-
- /* Excess precision other than "fast" requires front-end
- support. */
-- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
-- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
-+ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
- sorry ("-fexcess-precision=standard for Fortran");
- flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
-
---- a/src/gcc/genconditions.c
-+++ b/src/gcc/genconditions.c
-@@ -94,6 +94,7 @@ write_header (void)
- #include \"resource.h\"\n\
- #include \"diagnostic-core.h\"\n\
- #include \"reload.h\"\n\
-+#include \"memmodel.h\"\n\
- #include \"tm-constrs.h\"\n");
-
- if (saw_eh_return)
---- a/src/gcc/genemit.c
-+++ b/src/gcc/genemit.c
-@@ -792,6 +792,7 @@ from the machine description file `md'. */\n\n");
- printf ("#include \"reload.h\"\n");
- printf ("#include \"diagnostic-core.h\"\n");
- printf ("#include \"regs.h\"\n");
-+ printf ("#include \"memmodel.h\"\n");
- printf ("#include \"tm-constrs.h\"\n");
- printf ("#include \"ggc.h\"\n");
- printf ("#include \"dumpfile.h\"\n");
---- a/src/gcc/genmultilib
-+++ b/src/gcc/genmultilib
-@@ -186,7 +186,8 @@ fi
- EOF
- chmod +x tmpmultilib
-
--combinations=`initial=/ ./tmpmultilib ${options}`
-+combination_space=`initial=/ ./tmpmultilib ${options}`
-+combinations="$combination_space"
-
- # If there are exceptions, weed them out now
- if [ -n "${exceptions}" ]; then
-@@ -472,14 +473,19 @@ for rrule in ${multilib_reuse}; do
- # in this variable, it means no multilib will be built for the current reuse
- # rule. Thus the reuse purpose specified by the current rule is meaningless.
- if expr "${combinations} " : ".*/${combo}/.*" > /dev/null; then
-- combo="/${combo}/"
-- dirout=`./tmpmultilib3 "${combo}" "${todirnames}" "${toosdirnames}" "${enable_multilib}"`
-- copts="/${copts}/"
-- optout=`./tmpmultilib4 "${copts}" "${options}"`
-- # Output the line with all appropriate matches.
-- dirout="${dirout}" optout="${optout}" ./tmpmultilib2
-+ if expr "${combination_space} " : ".*/${copts}/.*" > /dev/null; then
-+ combo="/${combo}/"
-+ dirout=`./tmpmultilib3 "${combo}" "${todirnames}" "${toosdirnames}" "${enable_multilib}"`
-+ copts="/${copts}/"
-+ optout=`./tmpmultilib4 "${copts}" "${options}"`
-+ # Output the line with all appropriate matches.
-+ dirout="${dirout}" optout="${optout}" ./tmpmultilib2
-+ else
-+ echo "The rule ${rrule} contains an option absent from MULTILIB_OPTIONS." >&2
-+ exit 1
-+ fi
- else
-- echo "The rule ${rrule} is trying to reuse nonexistent multilib."
-+ echo "The rule ${rrule} is trying to reuse nonexistent multilib." >&2
- exit 1
- fi
- done
---- a/src/gcc/genoutput.c
-+++ b/src/gcc/genoutput.c
-@@ -231,6 +231,7 @@ output_prologue (void)
- printf ("#include \"diagnostic-core.h\"\n");
- printf ("#include \"output.h\"\n");
- printf ("#include \"target.h\"\n");
-+ printf ("#include \"memmodel.h\"\n");
- printf ("#include \"tm-constrs.h\"\n");
- }
-
---- a/src/gcc/genpeep.c
-+++ b/src/gcc/genpeep.c
-@@ -373,6 +373,7 @@ from the machine description file `md'. */\n\n");
- printf ("#include \"except.h\"\n");
- printf ("#include \"diagnostic-core.h\"\n");
- printf ("#include \"flags.h\"\n");
-+ printf ("#include \"memmodel.h\"\n");
- printf ("#include \"tm-constrs.h\"\n\n");
-
- printf ("extern rtx peep_operand[];\n\n");
---- a/src/gcc/genpreds.c
-+++ b/src/gcc/genpreds.c
-@@ -1577,6 +1577,7 @@ write_insn_preds_c (void)
- #include \"reload.h\"\n\
- #include \"regs.h\"\n\
- #include \"emit-rtl.h\"\n\
-+#include \"memmodel.h\"\n\
- #include \"tm-constrs.h\"\n");
-
- FOR_ALL_PREDICATES (p)
---- a/src/gcc/genrecog.c
-+++ b/src/gcc/genrecog.c
-@@ -4172,6 +4172,7 @@ write_header (void)
- #include \"diagnostic-core.h\"\n\
- #include \"reload.h\"\n\
- #include \"regs.h\"\n\
-+#include \"memmodel.h\"\n\
- #include \"tm-constrs.h\"\n\
- \n");
-
---- a/src/gcc/gimple-fold.c
-+++ b/src/gcc/gimple-fold.c
-@@ -1379,6 +1379,55 @@ gimple_fold_builtin_strncpy (gimple_stmt_iterator *gsi,
- return true;
- }
-
-+/* Simplify strchr (str, 0) into str + strlen (str).
-+ In general strlen is significantly faster than strchr
-+ due to being a simpler operation. */
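-+/* For example (sketch of the folded form), "q = strchr (s, 0);"
-+   becomes "tmp = strlen (s); q = s + tmp;".  */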
-+static bool
-+gimple_fold_builtin_strchr (gimple_stmt_iterator *gsi)
-+{
-+ gimple *stmt = gsi_stmt (*gsi);
-+ tree str = gimple_call_arg (stmt, 0);
-+ tree c = gimple_call_arg (stmt, 1);
-+ location_t loc = gimple_location (stmt);
-+
-+ if (optimize_function_for_size_p (cfun))
-+ return false;
-+
-+ if (!integer_zerop (c) || !gimple_call_lhs (stmt))
-+ return false;
-+
-+ tree len;
-+ tree strlen_fn = builtin_decl_implicit (BUILT_IN_STRLEN);
-+
-+ if (!strlen_fn)
-+ return false;
-+
-+ /* Create newstr = strlen (str). */
-+ gimple_seq stmts = NULL;
-+ gimple *new_stmt = gimple_build_call (strlen_fn, 1, str);
-+ gimple_set_location (new_stmt, loc);
-+ if (gimple_in_ssa_p (cfun))
-+ len = make_ssa_name (size_type_node);
-+ else
-+ len = create_tmp_reg (size_type_node);
-+ gimple_call_set_lhs (new_stmt, len);
-+ gimple_seq_add_stmt_without_update (&stmts, new_stmt);
-+
-+ /* Create (str p+ strlen (str)). */
-+ new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
-+ POINTER_PLUS_EXPR, str, len);
-+ gimple_seq_add_stmt_without_update (&stmts, new_stmt);
-+ gsi_replace_with_seq_vops (gsi, stmts);
-+ /* gsi now points at the assignment to the lhs, get a
-+ stmt iterator to the strlen.
-+ ??? We can't use gsi_for_stmt as that doesn't work when the
-+ CFG isn't built yet. */
-+ gimple_stmt_iterator gsi2 = *gsi;
-+ gsi_prev (&gsi2);
-+ fold_stmt (&gsi2);
-+ return true;
-+}
-+
- /* Simplify a call to the strcat builtin. DST and SRC are the arguments
- to the call.
-
-@@ -2820,6 +2869,11 @@ gimple_fold_builtin (gimple_stmt_iterator *gsi)
- gimple_call_arg (stmt, 1));
- case BUILT_IN_STRNCAT:
- return gimple_fold_builtin_strncat (gsi);
-+ case BUILT_IN_STRCHR:
-+ if (gimple_fold_builtin_strchr (gsi))
-+ return true;
-+ /* Perform additional folding in builtin.c. */
-+ break;
- case BUILT_IN_FPUTS:
- return gimple_fold_builtin_fputs (gsi, gimple_call_arg (stmt, 0),
- gimple_call_arg (stmt, 1), false);
---- a/src/gcc/ifcvt.c
-+++ b/src/gcc/ifcvt.c
-@@ -813,10 +813,15 @@ struct noce_if_info
-
- /* Estimated cost of the particular branch instruction. */
- unsigned int branch_cost;
-+
-+ /* The name of the noce transform that succeeded in if-converting
-+ this structure. Used for debugging. */
-+ const char *transform_name;
- };
-
- static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int);
- static int noce_try_move (struct noce_if_info *);
-+static int noce_try_ifelse_collapse (struct noce_if_info *);
- static int noce_try_store_flag (struct noce_if_info *);
- static int noce_try_addcc (struct noce_if_info *);
- static int noce_try_store_flag_constants (struct noce_if_info *);
-@@ -1115,11 +1120,45 @@ noce_try_move (struct noce_if_info *if_info)
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
- }
-+ if_info->transform_name = "noce_try_move";
- return TRUE;
- }
- return FALSE;
- }
-
-+/* Try forming an IF_THEN_ELSE (cond, b, a) and collapsing that
-+ through simplify_rtx. Sometimes that can eliminate the IF_THEN_ELSE.
-+ If that is the case, emit the result into x. */
-+
-+static int
-+noce_try_ifelse_collapse (struct noce_if_info * if_info)
-+{
-+ if (!noce_simple_bbs (if_info))
-+ return FALSE;
-+
-+ machine_mode mode = GET_MODE (if_info->x);
-+ rtx if_then_else = simplify_gen_ternary (IF_THEN_ELSE, mode, mode,
-+ if_info->cond, if_info->b,
-+ if_info->a);
-+
-+ if (GET_CODE (if_then_else) == IF_THEN_ELSE)
-+ return FALSE;
-+
-+ rtx_insn *seq;
-+ start_sequence ();
-+ noce_emit_move_insn (if_info->x, if_then_else);
-+ seq = end_ifcvt_sequence (if_info);
-+ if (!seq)
-+ return FALSE;
-+
-+ emit_insn_before_setloc (seq, if_info->jump,
-+ INSN_LOCATION (if_info->insn_a));
-+
-+ if_info->transform_name = "noce_try_ifelse_collapse";
-+ return TRUE;
-+}
-+
-+
- /* Convert "if (test) x = 1; else x = 0".
-
- Only try 0 and STORE_FLAG_VALUE here. Other combinations will be
-@@ -1163,6 +1202,7 @@ noce_try_store_flag (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_store_flag";
- return TRUE;
- }
- else
-@@ -1241,6 +1281,7 @@ noce_try_inverse_constants (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_inverse_constants";
- return true;
- }
-
-@@ -1461,6 +1502,8 @@ noce_try_store_flag_constants (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_store_flag_constants";
-+
- return TRUE;
- }
-
-@@ -1513,6 +1556,8 @@ noce_try_addcc (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_addcc";
-+
- return TRUE;
- }
- end_sequence ();
-@@ -1553,6 +1598,7 @@ noce_try_addcc (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_addcc";
- return TRUE;
- }
- end_sequence ();
-@@ -1617,6 +1663,8 @@ noce_try_store_flag_mask (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_store_flag_mask";
-+
- return TRUE;
- }
-
-@@ -1767,6 +1815,8 @@ noce_try_cmove (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_cmove";
-+
- return TRUE;
- }
- /* If both a and b are constants try a last-ditch transformation:
-@@ -1820,6 +1870,7 @@ noce_try_cmove (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_cmove";
- return TRUE;
- }
- else
-@@ -2273,6 +2324,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
-
- emit_insn_before_setloc (ifcvt_seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_cmove_arith";
- return TRUE;
-
- end_seq_and_fail:
-@@ -2364,28 +2416,32 @@ noce_get_alt_condition (struct noce_if_info *if_info, rtx target,
- switch (code)
- {
- case LT:
-- if (actual_val == desired_val + 1)
-+ if (desired_val != HOST_WIDE_INT_MAX
-+ && actual_val == desired_val + 1)
- {
- code = LE;
- op_b = GEN_INT (desired_val);
- }
- break;
- case LE:
-- if (actual_val == desired_val - 1)
-+ if (desired_val != HOST_WIDE_INT_MIN
-+ && actual_val == desired_val - 1)
- {
- code = LT;
- op_b = GEN_INT (desired_val);
- }
- break;
- case GT:
-- if (actual_val == desired_val - 1)
-+ if (desired_val != HOST_WIDE_INT_MIN
-+ && actual_val == desired_val - 1)
- {
- code = GE;
- op_b = GEN_INT (desired_val);
- }
- break;
- case GE:
-- if (actual_val == desired_val + 1)
-+ if (desired_val != HOST_WIDE_INT_MAX
-+ && actual_val == desired_val + 1)
- {
- code = GT;
- op_b = GEN_INT (desired_val);
-@@ -2525,6 +2581,7 @@ noce_try_minmax (struct noce_if_info *if_info)
- emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a));
- if_info->cond = cond;
- if_info->cond_earliest = earliest;
-+ if_info->transform_name = "noce_try_minmax";
-
- return TRUE;
- }
-@@ -2691,6 +2748,7 @@ noce_try_abs (struct noce_if_info *if_info)
- emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a));
- if_info->cond = cond;
- if_info->cond_earliest = earliest;
-+ if_info->transform_name = "noce_try_abs";
-
- return TRUE;
- }
-@@ -2772,6 +2830,8 @@ noce_try_sign_mask (struct noce_if_info *if_info)
- return FALSE;
-
- emit_insn_before_setloc (seq, if_info->jump, INSN_LOCATION (if_info->insn_a));
-+ if_info->transform_name = "noce_try_sign_mask";
-+
- return TRUE;
- }
-
-@@ -2877,6 +2937,7 @@ noce_try_bitop (struct noce_if_info *if_info)
- emit_insn_before_setloc (seq, if_info->jump,
- INSN_LOCATION (if_info->insn_a));
- }
-+ if_info->transform_name = "noce_try_bitop";
- return TRUE;
- }
-
-@@ -3167,6 +3228,41 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
- if (if_info->then_else_reversed)
- std::swap (old_val, new_val);
-
-+
-+ /* We allow simple lowpart register subreg SET sources in
-+ bb_ok_for_noce_convert_multiple_sets. Be careful when processing
-+ sequences like:
-+ (set (reg:SI r1) (reg:SI r2))
-+ (set (reg:HI r3) (subreg:HI (r1)))
-+ For the second insn new_val or old_val (r1 in this example) will be
-+ taken from the temporaries and have the wider mode which will not
-+ match with the mode of the other source of the conditional move, so
-+ we'll end up trying to emit r4:HI = cond ? (r1:SI) : (r3:HI).
-+ Wrap the two cmove operands into subregs if appropriate to prevent
-+ that. */
-+ if (GET_MODE (new_val) != GET_MODE (temp))
-+ {
-+ machine_mode src_mode = GET_MODE (new_val);
-+ machine_mode dst_mode = GET_MODE (temp);
-+ if (GET_MODE_SIZE (src_mode) <= GET_MODE_SIZE (dst_mode))
-+ {
-+ end_sequence ();
-+ return FALSE;
-+ }
-+ new_val = lowpart_subreg (dst_mode, new_val, src_mode);
-+ }
-+ if (GET_MODE (old_val) != GET_MODE (temp))
-+ {
-+ machine_mode src_mode = GET_MODE (old_val);
-+ machine_mode dst_mode = GET_MODE (temp);
-+ if (GET_MODE_SIZE (src_mode) <= GET_MODE_SIZE (dst_mode))
-+ {
-+ end_sequence ();
-+ return FALSE;
-+ }
-+ old_val = lowpart_subreg (dst_mode, old_val, src_mode);
-+ }
-+
- /* Actually emit the conditional move. */
- rtx temp_dest = noce_emit_cmove (if_info, temp, cond_code,
- x, y, new_val, old_val);
-@@ -3240,6 +3336,7 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
- }
-
- num_updated_if_blocks++;
-+ if_info->transform_name = "noce_convert_multiple_sets";
- return TRUE;
- }
-
-@@ -3277,9 +3374,15 @@ bb_ok_for_noce_convert_multiple_sets (basic_block test_bb,
- rtx src = SET_SRC (set);
-
- /* We can possibly relax this, but for now only handle REG to REG
-- moves. This avoids any issues that might come from introducing
-- loads/stores that might violate data-race-freedom guarantees. */
-- if (!(REG_P (src) && REG_P (dest)))
-+ (including subreg) moves. This avoids any issues that might come
-+ from introducing loads/stores that might violate data-race-freedom
-+ guarantees. */
-+ if (!REG_P (dest))
-+ return false;
-+
-+ if (!(REG_P (src)
-+ || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src))
-+ && subreg_lowpart_p (src))))
- return false;
-
- /* Destination must be appropriate for a conditional write. */
-@@ -3336,7 +3439,12 @@ noce_process_if_block (struct noce_if_info *if_info)
- && bb_ok_for_noce_convert_multiple_sets (then_bb, if_info))
- {
- if (noce_convert_multiple_sets (if_info))
-- return TRUE;
-+ {
-+ if (dump_file && if_info->transform_name)
-+ fprintf (dump_file, "if-conversion succeeded through %s\n",
-+ if_info->transform_name);
-+ return TRUE;
-+ }
- }
-
- if (! bb_valid_for_noce_process_p (then_bb, cond, &if_info->then_cost,
-@@ -3493,6 +3601,8 @@ noce_process_if_block (struct noce_if_info *if_info)
-
- if (noce_try_move (if_info))
- goto success;
-+ if (noce_try_ifelse_collapse (if_info))
-+ goto success;
- if (noce_try_store_flag (if_info))
- goto success;
- if (noce_try_bitop (if_info))
-@@ -3533,6 +3643,9 @@ noce_process_if_block (struct noce_if_info *if_info)
- return FALSE;
-
- success:
-+ if (dump_file && if_info->transform_name)
-+ fprintf (dump_file, "if-conversion succeeded through %s\n",
-+ if_info->transform_name);
-
- /* If we used a temporary, fix it up now. */
- if (orig_x != x)
---- a/src/gcc/internal-fn.c
-+++ b/src/gcc/internal-fn.c
-@@ -1812,11 +1812,7 @@ expand_arith_overflow (enum tree_code code, gimple *stmt)
- /* For sub-word operations, retry with a wider type first. */
- if (orig_precres == precres && precop <= BITS_PER_WORD)
- {
--#if WORD_REGISTER_OPERATIONS
-- int p = BITS_PER_WORD;
--#else
-- int p = precop;
--#endif
-+ int p = WORD_REGISTER_OPERATIONS ? BITS_PER_WORD : precop;
- enum machine_mode m = smallest_mode_for_size (p, MODE_INT);
- tree optype = build_nonstandard_integer_type (GET_MODE_PRECISION (m),
- uns0_p && uns1_p
---- a/src/gcc/java/lang.c
-+++ b/src/gcc/java/lang.c
-@@ -569,8 +569,7 @@ java_post_options (const char **pfilename)
-
- /* Excess precision other than "fast" requires front-end
- support. */
-- if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD
-- && TARGET_FLT_EVAL_METHOD_NON_DEFAULT)
-+ if (flag_excess_precision_cmdline == EXCESS_PRECISION_STANDARD)
- sorry ("-fexcess-precision=standard for Java");
- flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
-
---- a/src/gcc/lra-constraints.c
-+++ b/src/gcc/lra-constraints.c
-@@ -1326,7 +1326,22 @@ process_addr_reg (rtx *loc, bool check_only_p, rtx_insn **before, rtx_insn **aft
-
- subreg_p = GET_CODE (*loc) == SUBREG;
- if (subreg_p)
-- loc = &SUBREG_REG (*loc);
-+ {
-+ reg = SUBREG_REG (*loc);
-+ mode = GET_MODE (reg);
-+
-+      /* For a mode whose size is bigger than ptr_mode, there is unlikely to
-+	 be a "mov" between two registers with different classes, but there
-+	 will normally be a "mov" which transfers an element of a vector
-+	 register into a general register, and this will normally be a subreg
-+	 which should be reloaded as a whole.  This is particularly likely to
-+	 be triggered when -fno-split-wide-types is specified.  */
-+ if (!REG_P (reg)
-+ || in_class_p (reg, cl, &new_class)
-+ || GET_MODE_SIZE (mode) <= GET_MODE_SIZE (ptr_mode))
-+ loc = &SUBREG_REG (*loc);
-+ }
-+
- reg = *loc;
- mode = GET_MODE (reg);
- if (! REG_P (reg))
-@@ -2475,14 +2490,29 @@ process_alt_operands (int only_alternative)
- /* We are trying to spill pseudo into memory. It is
- usually more costly than moving to a hard register
- 	     although it might take the same number of
-- reloads. */
-- if (no_regs_p && REG_P (op) && hard_regno[nop] >= 0)
-+ reloads.
-+
-+ Non-pseudo spill may happen also. Suppose a target allows both
-+ register and memory in the operand constraint alternatives,
-+ then it's typical that an eliminable register has a substition
-+ of "base + offset" which can either be reloaded by a simple
-+ "new_reg <= base + offset" which will match the register
-+ constraint, or a similar reg addition followed by further spill
-+ to and reload from memory which will match the memory
-+ constraint, but this memory spill will be much more costly
-+ usually.
-+
-+ Code below increases the reject for both pseudo and non-pseudo
-+ spill. */
-+ if (no_regs_p
-+ && !(MEM_P (op) && offmemok)
-+ && !(REG_P (op) && hard_regno[nop] < 0))
- {
- if (lra_dump_file != NULL)
- fprintf
- (lra_dump_file,
-- " %d Spill pseudo into memory: reject+=3\n",
-- nop);
-+ " %d Spill %spseudo into memory: reject+=3\n",
-+ nop, REG_P (op) ? "" : "Non-");
- reject += 3;
- if (VECTOR_MODE_P (mode))
- {
---- a/src/gcc/lto/lto-partition.c
-+++ b/src/gcc/lto/lto-partition.c
-@@ -447,7 +447,7 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, ltrans_partition partition)
- and in-partition calls was reached. */
-
- void
--lto_balanced_map (int n_lto_partitions)
-+lto_balanced_map (int n_lto_partitions, int max_partition_size)
- {
- int n_nodes = 0;
- int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
-@@ -511,6 +511,9 @@ lto_balanced_map (int n_lto_partitions)
- varpool_order.qsort (varpool_node_cmp);
-
- /* Compute partition size and create the first partition. */
-+ if (PARAM_VALUE (MIN_PARTITION_SIZE) > max_partition_size)
-+ fatal_error (input_location, "min partition size cannot be greater than max partition size");
-+
- partition_size = total_size / n_lto_partitions;
- if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE))
- partition_size = PARAM_VALUE (MIN_PARTITION_SIZE);
-@@ -719,7 +722,8 @@ lto_balanced_map (int n_lto_partitions)
- best_cost, best_internal, best_i);
- /* Partition is too large, unwind into step when best cost was reached and
- start new partition. */
-- if (partition->insns > 2 * partition_size)
-+ if (partition->insns > 2 * partition_size
-+ || partition->insns > max_partition_size)
- {
- if (best_i != i)
- {
---- a/src/gcc/lto/lto-partition.h
-+++ b/src/gcc/lto/lto-partition.h
-@@ -35,7 +35,7 @@ extern vec<ltrans_partition> ltrans_partitions;
-
- void lto_1_to_1_map (void);
- void lto_max_map (void);
--void lto_balanced_map (int);
-+void lto_balanced_map (int, int);
- void lto_promote_cross_file_statics (void);
- void free_ltrans_partitions (void);
- void lto_promote_statics_nonwpa (void);
---- a/src/gcc/lto/lto.c
-+++ b/src/gcc/lto/lto.c
-@@ -3123,9 +3123,10 @@ do_whole_program_analysis (void)
- else if (flag_lto_partition == LTO_PARTITION_MAX)
- lto_max_map ();
- else if (flag_lto_partition == LTO_PARTITION_ONE)
-- lto_balanced_map (1);
-+ lto_balanced_map (1, INT_MAX);
- else if (flag_lto_partition == LTO_PARTITION_BALANCED)
-- lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS));
-+ lto_balanced_map (PARAM_VALUE (PARAM_LTO_PARTITIONS),
-+ PARAM_VALUE (MAX_PARTITION_SIZE));
- else
- gcc_unreachable ();
-
---- a/src/gcc/match.pd
-+++ b/src/gcc/match.pd
-@@ -468,6 +468,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
- (bit_and:c (convert? @0) (convert? (bit_not @0)))
- { build_zero_cst (type); })
-
-+/* PR71636: Transform x & ((1U << b) - 1) -> x & ~(~0U << b); */
-+(simplify
-+ (bit_and:c @0 (plus:s (lshift:s integer_onep @1) integer_minus_onep))
-+ (if (TYPE_UNSIGNED (type))
-+ (bit_and @0 (bit_not (lshift { build_all_ones_cst (type); } @1)))))
-+
- /* Fold (A & ~B) - (A & B) into (A ^ B) - B. */
- (simplify
- (minus (bit_and:cs @0 (bit_not @1)) (bit_and:cs @0 @1))
---- /dev/null
-+++ b/src/gcc/memmodel.h
-@@ -0,0 +1,86 @@
-+/* Prototypes of memory model helper functions.
-+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
-+
-+This file is part of GCC.
-+
-+GCC is free software; you can redistribute it and/or modify it under
-+the terms of the GNU General Public License as published by the Free
-+Software Foundation; either version 3, or (at your option) any later
-+version.
-+
-+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
-+WARRANTY; without even the implied warranty of MERCHANTABILITY or
-+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-+for more details.
-+
-+You should have received a copy of the GNU General Public License
-+along with GCC; see the file COPYING3. If not see
-+<http://www.gnu.org/licenses/>. */
-+
-+#ifndef GCC_MEMMODEL_H
-+#define GCC_MEMMODEL_H
-+
-+/* Return the memory model from a host integer. */
-+static inline enum memmodel
-+memmodel_from_int (unsigned HOST_WIDE_INT val)
-+{
-+ return (enum memmodel) (val & MEMMODEL_MASK);
-+}
-+
-+/* Return the base memory model from a host integer. */
-+static inline enum memmodel
-+memmodel_base (unsigned HOST_WIDE_INT val)
-+{
-+ return (enum memmodel) (val & MEMMODEL_BASE_MASK);
-+}
-+
-+/* Return TRUE if the memory model is RELAXED. */
-+static inline bool
-+is_mm_relaxed (enum memmodel model)
-+{
-+ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED;
-+}
-+
-+/* Return TRUE if the memory model is CONSUME. */
-+static inline bool
-+is_mm_consume (enum memmodel model)
-+{
-+ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME;
-+}
-+
-+/* Return TRUE if the memory model is ACQUIRE. */
-+static inline bool
-+is_mm_acquire (enum memmodel model)
-+{
-+ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE;
-+}
-+
-+/* Return TRUE if the memory model is RELEASE. */
-+static inline bool
-+is_mm_release (enum memmodel model)
-+{
-+ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE;
-+}
-+
-+/* Return TRUE if the memory model is ACQ_REL. */
-+static inline bool
-+is_mm_acq_rel (enum memmodel model)
-+{
-+ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQ_REL;
-+}
-+
-+/* Return TRUE if the memory model is SEQ_CST. */
-+static inline bool
-+is_mm_seq_cst (enum memmodel model)
-+{
-+ return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST;
-+}
-+
-+/* Return TRUE if the memory model is a SYNC variant. */
-+static inline bool
-+is_mm_sync (enum memmodel model)
-+{
-+ return (model & MEMMODEL_SYNC);
-+}
-+
-+#endif /* GCC_MEMMODEL_H */
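
Every helper in the new memmodel.h follows one idiom: the base memory model lives in the low bits of the value and extra flags such as MEMMODEL_SYNC are OR'ed in above it. A rough user-level sketch of the same idiom, assuming only the __ATOMIC_* macros GCC predefines; MODEL_BASE_MASK and MODEL_SYNC_FLAG are hypothetical stand-ins for the internal MEMMODEL_BASE_MASK and MEMMODEL_SYNC:

#include <stdio.h>

#define MODEL_BASE_MASK 7          /* hypothetical: low bits hold the base model */
#define MODEL_SYNC_FLAG (1 << 3)   /* hypothetical flag bit above the base */

static int
is_seq_cst (unsigned model)
{
  /* Mirrors is_mm_seq_cst above: mask off the flags, compare the base.  */
  return (model & MODEL_BASE_MASK) == __ATOMIC_SEQ_CST;
}

int
main (void)
{
  unsigned m = __ATOMIC_SEQ_CST | MODEL_SYNC_FLAG;
  printf ("base seq_cst: %d, sync flag: %d\n",
          is_seq_cst (m), (m & MODEL_SYNC_FLAG) != 0);
  return 0;
}
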
---- a/src/gcc/optabs.c
-+++ b/src/gcc/optabs.c
-@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see
- #include "target.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "predict.h"
- #include "tm_p.h"
- #include "expmed.h"
---- a/src/gcc/params.def
-+++ b/src/gcc/params.def
-@@ -1027,7 +1027,12 @@ DEFPARAM (PARAM_LTO_PARTITIONS,
- DEFPARAM (MIN_PARTITION_SIZE,
- "lto-min-partition",
- "Minimal size of a partition for LTO (in estimated instructions).",
-- 1000, 0, 0)
-+ 10000, 0, 0)
-+
-+DEFPARAM (MAX_PARTITION_SIZE,
-+ "lto-max-partition",
-+ "Maximal size of a partition for LTO (in estimated instructions).",
-+ 1000000, 0, INT_MAX)
-
- /* Diagnostic parameters. */
-
---- a/src/gcc/rtlanal.c
-+++ b/src/gcc/rtlanal.c
-@@ -3663,6 +3663,16 @@ subreg_get_info (unsigned int xregno, machine_mode xmode,
- info->offset = offset / regsize_xmode;
- return;
- }
-+ /* It's not valid to extract a subreg of mode YMODE at OFFSET that
-+ would go outside of XMODE. */
-+ if (!rknown
-+ && GET_MODE_SIZE (ymode) + offset > GET_MODE_SIZE (xmode))
-+ {
-+ info->representable_p = false;
-+ info->nregs = nregs_ymode;
-+ info->offset = offset / regsize_xmode;
-+ return;
-+ }
- /* Quick exit for the simple and common case of extracting whole
- subregisters from a multiregister value. */
- /* ??? It would be better to integrate this into the code below,
-@@ -4590,13 +4600,14 @@ nonzero_bits1 (const_rtx x, machine_mode mode, const_rtx known_x,
- nonzero &= cached_nonzero_bits (SUBREG_REG (x), mode,
- known_x, known_mode, known_ret);
-
--#if WORD_REGISTER_OPERATIONS && defined (LOAD_EXTEND_OP)
-+#ifdef LOAD_EXTEND_OP
- /* If this is a typical RISC machine, we only have to worry
- about the way loads are extended. */
-- if ((LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND
-- ? val_signbit_known_set_p (inner_mode, nonzero)
-- : LOAD_EXTEND_OP (inner_mode) != ZERO_EXTEND)
-- || !MEM_P (SUBREG_REG (x)))
-+ if (WORD_REGISTER_OPERATIONS
-+ && ((LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND
-+ ? val_signbit_known_set_p (inner_mode, nonzero)
-+ : LOAD_EXTEND_OP (inner_mode) != ZERO_EXTEND)
-+ || !MEM_P (SUBREG_REG (x))))
- #endif
- {
- /* On many CISC machines, accessing an object in a wider mode
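
The nonzero_bits1 change above is an instance of a general cleanup pattern: a preprocessor guard of the form "#if A && defined (B)" becomes "#ifdef B" plus an ordinary runtime "if (A && ...)", so the guarded code is parsed and type-checked even on targets where A is 0, while the optimizer still deletes it. A self-contained sketch, with WORD_OPS as a hypothetical stand-in for WORD_REGISTER_OPERATIONS:

#define WORD_OPS 0   /* hypothetical stand-in; a real target may define it to 1 */

int
narrow (int wide)
{
  /* Always compiled and checked; folded away entirely when WORD_OPS is 0.  */
  if (WORD_OPS && wide > 0xff)
    return wide & 0xff;
  return wide;
}
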
---- a/src/gcc/simplify-rtx.c
-+++ b/src/gcc/simplify-rtx.c
-@@ -5274,6 +5274,50 @@ simplify_const_relational_operation (enum rtx_code code,
-
- return 0;
- }
-+
-+/* Recognize expressions of the form (X CMP 0) ? VAL : OP (X)
-+   where OP is CLZ or CTZ and VAL is the value from CLZ_DEFINED_VALUE_AT_ZERO
-+   or CTZ_DEFINED_VALUE_AT_ZERO respectively.  Return OP (X) if the
-+   expression can be simplified to that, or NULL_RTX if not.
-+   Assume X is compared against zero with CMP_CODE; the true
-+   arm is TRUE_VAL and the false arm is FALSE_VAL.  */
-+
-+static rtx
-+simplify_cond_clz_ctz (rtx x, rtx_code cmp_code, rtx true_val, rtx false_val)
-+{
-+ if (cmp_code != EQ && cmp_code != NE)
-+ return NULL_RTX;
-+
-+  /* Result on X == 0 and X != 0 respectively.  */
-+ rtx on_zero, on_nonzero;
-+ if (cmp_code == EQ)
-+ {
-+ on_zero = true_val;
-+ on_nonzero = false_val;
-+ }
-+ else
-+ {
-+ on_zero = false_val;
-+ on_nonzero = true_val;
-+ }
-+
-+ rtx_code op_code = GET_CODE (on_nonzero);
-+ if ((op_code != CLZ && op_code != CTZ)
-+ || !rtx_equal_p (XEXP (on_nonzero, 0), x)
-+ || !CONST_INT_P (on_zero))
-+ return NULL_RTX;
-+
-+ HOST_WIDE_INT op_val;
-+ if (((op_code == CLZ
-+ && CLZ_DEFINED_VALUE_AT_ZERO (GET_MODE (on_nonzero), op_val))
-+ || (op_code == CTZ
-+ && CTZ_DEFINED_VALUE_AT_ZERO (GET_MODE (on_nonzero), op_val)))
-+ && op_val == INTVAL (on_zero))
-+ return on_nonzero;
-+
-+ return NULL_RTX;
-+}
-+
-
- /* Simplify CODE, an operation with result mode MODE and three operands,
- OP0, OP1, and OP2. OP0_MODE was the mode of OP0 before it became
-@@ -5407,6 +5451,19 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
- }
- }
-
-+ /* Convert x == 0 ? N : clz (x) into clz (x) when
-+ CLZ_DEFINED_VALUE_AT_ZERO is defined to N for the mode of x.
-+ Similarly for ctz (x). */
-+ if (COMPARISON_P (op0) && !side_effects_p (op0)
-+ && XEXP (op0, 1) == const0_rtx)
-+ {
-+ rtx simplified
-+ = simplify_cond_clz_ctz (XEXP (op0, 0), GET_CODE (op0),
-+ op1, op2);
-+ if (simplified)
-+ return simplified;
-+ }
-+
- if (COMPARISON_P (op0) && ! side_effects_p (op0))
- {
- machine_mode cmp_mode = (GET_MODE (XEXP (op0, 0)) == VOIDmode
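
In source terms, the new simplify_cond_clz_ctz lets the idiom below collapse to a bare clz instruction on targets whose CLZ_DEFINED_VALUE_AT_ZERO reports the mode bitsize (32 for SImode on AArch64, for example); the gcc.c-torture pr37780 test added later in this patch exercises exactly this shape. A minimal compilable example:

/* Sketch only: the 32 must match what the target defines clz (0) to be.  */
unsigned
count_leading (unsigned i)
{
  return i == 0 ? 32 : __builtin_clz (i);
}
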
---- a/src/gcc/system.h
-+++ b/src/gcc/system.h
-@@ -971,7 +971,8 @@ extern void fancy_abort (const char *, int, const char *) ATTRIBUTE_NORETURN;
- EXTRA_ADDRESS_CONSTRAINT CONST_DOUBLE_OK_FOR_CONSTRAINT_P \
- CALLER_SAVE_PROFITABLE LARGEST_EXPONENT_IS_NORMAL \
- ROUND_TOWARDS_ZERO SF_SIZE DF_SIZE XF_SIZE TF_SIZE LIBGCC2_TF_CEXT \
-- LIBGCC2_LONG_DOUBLE_TYPE_SIZE STRUCT_VALUE EH_FRAME_IN_DATA_SECTION
-+ LIBGCC2_LONG_DOUBLE_TYPE_SIZE STRUCT_VALUE \
-+ EH_FRAME_IN_DATA_SECTION TARGET_FLT_EVAL_METHOD_NON_DEFAULT
-
- /* Hooks that are no longer used. */
- #pragma GCC poison LANG_HOOKS_FUNCTION_MARK LANG_HOOKS_FUNCTION_FREE \
---- a/src/gcc/testsuite/c-c++-common/asan/clone-test-1.c
-+++ b/src/gcc/testsuite/c-c++-common/asan/clone-test-1.c
-@@ -29,6 +29,10 @@ int main(int argc, char **argv) {
- char *sp = child_stack + kStackSize; /* Stack grows down. */
- printf("Parent: %p\n", sp);
- pid_t clone_pid = clone(Child, sp, CLONE_FILES | CLONE_VM, NULL, 0, 0, 0);
-+ if (clone_pid == -1) {
-+ perror("clone");
-+ return 1;
-+ }
- int status;
- pid_t wait_result = waitpid(clone_pid, &status, __WCLONE);
- if (wait_result < 0) {
---- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-3.C
-+++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-3.C
-@@ -1,5 +1,6 @@
- /* Test various operators on __fp16 and mixed __fp16/float operands. */
- /* { dg-do run { target arm*-*-* } } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- #include "arm-fp16-ops.h"
---- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-4.C
-+++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/arm-fp16-ops-4.C
-@@ -1,5 +1,6 @@
- /* Test various operators on __fp16 and mixed __fp16/float operands. */
- /* { dg-do run { target arm*-*-* } } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative -ffast-math" } */
-
- #include "arm-fp16-ops.h"
---- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-param-1.C
-+++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-param-1.C
-@@ -1,10 +1,14 @@
- /* { dg-do compile { target arm*-*-* } } */
- /* { dg-options "-mfp16-format=ieee" } */
-
--/* Functions cannot have parameters of type __fp16. */
--extern void f (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */
--extern void (*pf) (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */
-+/* Test that the ACLE macro is defined. */
-+#if __ARM_FP16_ARGS != 1
-+#error Unexpected value for __ARM_FP16_ARGS
-+#endif
-+
-+/* Test that __fp16 is supported as a parameter type. */
-+extern void f (__fp16);
-+extern void (*pf) (__fp16);
-
--/* These should be OK. */
- extern void g (__fp16 *);
- extern void (*pg) (__fp16 *);
---- a/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-return-1.C
-+++ b/src/gcc/testsuite/g++.dg/ext/arm-fp16/fp16-return-1.C
-@@ -1,10 +1,9 @@
- /* { dg-do compile { target arm*-*-* } } */
- /* { dg-options "-mfp16-format=ieee" } */
-
--/* Functions cannot return type __fp16. */
--extern __fp16 f (void); /* { dg-error "cannot return __fp16" } */
--extern __fp16 (*pf) (void); /* { dg-error "cannot return __fp16" } */
-+/* Test that __fp16 is supported as a return type. */
-+extern __fp16 f (void);
-+extern __fp16 (*pf) (void);
-
--/* These should be OK. */
- extern __fp16 *g (void);
- extern __fp16 *(*pg) (void);
---- a/src/gcc/testsuite/g++.dg/inherit/thunk1.C
-+++ b/src/gcc/testsuite/g++.dg/inherit/thunk1.C
-@@ -1,4 +1,5 @@
--// { dg-do run { target i?86-*-* x86_64-*-* s390*-*-* alpha*-*-* ia64-*-* sparc*-*-* } }
-+// { dg-do run { target arm*-*-* aarch64*-*-* i?86-*-* x86_64-*-* s390*-*-* alpha*-*-* ia64-*-* sparc*-*-* } }
-+// { dg-skip-if "" { arm_thumb1_ok } }
-
- #include <stdarg.h>
-
---- a/src/gcc/testsuite/g++.dg/lto/pr69589_0.C
-+++ b/src/gcc/testsuite/g++.dg/lto/pr69589_0.C
-@@ -1,6 +1,8 @@
- // { dg-lto-do link }
--// { dg-lto-options "-O2 -rdynamic" }
-+// { dg-lto-options "-O2 -rdynamic" }
- // { dg-extra-ld-options "-r -nostdlib" }
-+// { dg-skip-if "Skip targets without -rdynamic support" { arm*-none-eabi aarch64*-*-elf } { "*" } { "" } }
-+
- #pragma GCC visibility push(hidden)
- struct A { int &operator[] (long); };
- template <typename> struct B;
---- /dev/null
-+++ b/src/gcc/testsuite/g++.dg/opt/pr78201.C
-@@ -0,0 +1,13 @@
-+// PR middle-end/78201
-+// { dg-do compile }
-+// { dg-options "-O2" }
-+
-+struct B { long d (); } *c;
-+long e;
-+
-+void
-+foo ()
-+{
-+ char a[e] = "";
-+ c && c->d();
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.c-torture/compile/pr71112.c
-@@ -0,0 +1,10 @@
-+/* PR target/71112. */
-+/* { dg-additional-options "-fpie" { target pie } } */
-+
-+extern int dbs[100];
-+void f (int *);
-+int nscd_init (void)
-+{
-+ f (dbs);
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.c-torture/compile/pr71295.c
-@@ -0,0 +1,12 @@
-+extern void fn2 (long long);
-+int a;
-+
-+void
-+fn1 ()
-+{
-+ long long b[3];
-+ a = 0;
-+ for (; a < 3; a++)
-+ b[a] = 1;
-+ fn2 (b[1]);
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.c-torture/compile/pr78362.c
-@@ -0,0 +1,11 @@
-+/* PR target/78362. */
-+
-+long a;
-+
-+void
-+foo (void)
-+{
-+ for (;; a--)
-+ if ((int) a)
-+ break;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.c-torture/compile/pr78694.c
-@@ -0,0 +1,118 @@
-+/* PR target/78694. */
-+
-+enum
-+{
-+ MEMMODEL_RELAXED,
-+ MEMMODEL_ACQUIRE,
-+ PRIORITY_INSERT_END
-+};
-+enum
-+{
-+ PQ_CHILDREN,
-+ PQ_TASKGROUP
-+};
-+struct gomp_team_state
-+{
-+ struct gomp_team *team;
-+};
-+enum gomp_task_kind
-+{
-+ GOMP_TASK_UNDEFERRED,
-+ GOMP_TASK_WAITING
-+};
-+struct gomp_taskwait
-+{
-+ _Bool in_taskwait;
-+};
-+struct gomp_task
-+{
-+ struct gomp_task *parent;
-+ int children_queue;
-+ struct gomp_taskgroup *taskgroup;
-+ int dependers;
-+ struct gomp_taskwait taskwait;
-+ enum gomp_task_kind kind;
-+ _Bool in_tied_task;
-+} j, q, *n;
-+struct gomp_taskgroup
-+{
-+ _Bool in_taskgroup_wait;
-+ int num_children;
-+} l;
-+struct gomp_team
-+{
-+ int task_queue;
-+ int task_running_count;
-+};
-+struct gomp_thread
-+{
-+ struct gomp_team_state ts;
-+ struct gomp_task task;
-+} extern __thread a;
-+
-+int b, c, d, e, f, g, h, i, k, m, o, p, r;
-+
-+void priority_queue_next_task (struct gomp_task *, int, int);
-+int gomp_task_run_pre (struct gomp_task *, struct gomp_task, struct gomp_team);
-+void priority_queue_insert (int, struct gomp_task);
-+void priority_queue_insert2 (int, struct gomp_task, int, int, int);
-+void priority_queue_insert3 (int, struct gomp_task, int, int, int);
-+void gomp_sem_post (int);
-+void free (void *);
-+
-+_Bool s;
-+int
-+GOMP_taskgroup_end ()
-+{
-+ struct gomp_thread *t = &a;
-+ struct gomp_team u = *t->ts.team;
-+ struct gomp_task *v = &t->task, *w;
-+ if (__atomic_load_n (&l.num_children, MEMMODEL_ACQUIRE))
-+ while (1)
-+ {
-+ if (l.num_children)
-+ priority_queue_next_task (v, u.task_queue, r);
-+ else if (w)
-+ free (w);
-+ if (n->kind == GOMP_TASK_WAITING)
-+ {
-+ s = gomp_task_run_pre (n, q, u);
-+ if (__builtin_expect (s, 0))
-+ {
-+ if (w)
-+ free (w);
-+ goto finish_cancelled;
-+ }
-+ n = 0;
-+ l.in_taskgroup_wait = 1;
-+ }
-+ if (w)
-+ {
-+ t->task = *n;
-+ if (__builtin_expect (p, 0))
-+ if (o)
-+ t->task = *v;
-+ }
-+ if (n)
-+ {
-+ struct gomp_task x = x;
-+ for (; i; b++)
-+ {
-+ struct gomp_task y = j;
-+ if (g)
-+ continue;
-+ priority_queue_insert (PQ_CHILDREN, x);
-+ if (x.taskwait.in_taskwait)
-+ priority_queue_insert2 (PQ_TASKGROUP, y, e, 0, d);
-+ if (h)
-+ gomp_sem_post (f);
-+ priority_queue_insert3 (k, y, PRIORITY_INSERT_END, 0, d);
-+ ++c;
-+ }
-+ }
-+ finish_cancelled:
-+ w = (struct gomp_task *) (n - u.task_running_count - v);
-+ }
-+ v->taskgroup = (struct gomp_taskgroup *) m;
-+ return 1;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.c-torture/execute/pr37780.c
-@@ -0,0 +1,49 @@
-+/* PR middle-end/37780. */
-+
-+#define VAL (8 * sizeof (int))
-+
-+int __attribute__ ((noinline, noclone))
-+fooctz (int i)
-+{
-+ return (i == 0) ? VAL : __builtin_ctz (i);
-+}
-+
-+int __attribute__ ((noinline, noclone))
-+fooctz2 (int i)
-+{
-+ return (i != 0) ? __builtin_ctz (i) : VAL;
-+}
-+
-+unsigned int __attribute__ ((noinline, noclone))
-+fooctz3 (unsigned int i)
-+{
-+ return (i > 0) ? __builtin_ctz (i) : VAL;
-+}
-+
-+int __attribute__ ((noinline, noclone))
-+fooclz (int i)
-+{
-+ return (i == 0) ? VAL : __builtin_clz (i);
-+}
-+
-+int __attribute__ ((noinline, noclone))
-+fooclz2 (int i)
-+{
-+ return (i != 0) ? __builtin_clz (i) : VAL;
-+}
-+
-+unsigned int __attribute__ ((noinline, noclone))
-+fooclz3 (unsigned int i)
-+{
-+ return (i > 0) ? __builtin_clz (i) : VAL;
-+}
-+
-+int
-+main (void)
-+{
-+ if (fooctz (0) != VAL || fooctz2 (0) != VAL || fooctz3 (0) != VAL
-+ || fooclz (0) != VAL || fooclz2 (0) != VAL || fooclz3 (0) != VAL)
-+ __builtin_abort ();
-+
-+ return 0;
-+}
-\ No newline at end of file
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.c-torture/execute/pr66940.c
-@@ -0,0 +1,20 @@
-+long long __attribute__ ((noinline, noclone))
-+foo (long long ival)
-+{
-+ if (ival <= 0)
-+ return -0x7fffffffffffffffL - 1;
-+
-+ return 0x7fffffffffffffffL;
-+}
-+
-+int
-+main (void)
-+{
-+ if (foo (-1) != (-0x7fffffffffffffffL - 1))
-+ __builtin_abort ();
-+
-+ if (foo (1) != 0x7fffffffffffffffL)
-+ __builtin_abort ();
-+
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.dg/asr_div1.c
-+++ b/src/gcc/testsuite/gcc.dg/asr_div1.c
-@@ -1,6 +1,7 @@
- /* Test division by const int generates only one shift. */
- /* { dg-do run } */
- /* { dg-options "-O2 -fdump-rtl-combine-all" } */
-+/* { dg-options "-O2 -fdump-rtl-combine-all -mtune=cortex-a53" { target aarch64*-*-* } } */
-
- extern void abort (void);
-
---- a/src/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
-+++ b/src/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
-@@ -24,7 +24,7 @@
- | FE_OVERFLOW \
- | FE_UNDERFLOW)
-
--#if defined __alpha__
-+#if defined __alpha__ || defined __aarch64__
- #define ITER_COUNT 100
- #else
- #define ITER_COUNT 10000
---- a/src/gcc/testsuite/gcc.dg/cpp/trad/include.c
-+++ b/src/gcc/testsuite/gcc.dg/cpp/trad/include.c
-@@ -2,11 +2,5 @@
-
- /* Test that macros are not expanded in the <> quotes of #include. */
-
--/* vxWorksCommon.h uses the "#" operator to construct the name of an
-- include file, thus making the file incompatible with -traditional-cpp.
-- Newlib uses ## when including stdlib.h as of 2007-09-07. */
--/* { dg-do preprocess { target { { ! vxworks_kernel } && { ! newlib } } } } */
--
--#define __STDC__ 1 /* Stop complaints about non-ISO compilers. */
--#define stdlib 1
--#include <stdlib.h> /* { dg-bogus "o such file or directory" } */
-+#define builtins 1
-+#include <builtins.h> /* { dg-bogus "o such file or directory" } */
---- a/src/gcc/testsuite/gcc.dg/cpp/trad/trad.exp
-+++ b/src/gcc/testsuite/gcc.dg/cpp/trad/trad.exp
-@@ -29,7 +29,7 @@ load_lib gcc-dg.exp
- # If a testcase doesn't have special options, use these.
- global DEFAULT_TRADCPPFLAGS
- if ![info exists DEFAULT_TRADCPPFLAGS] then {
-- set DEFAULT_TRADCPPFLAGS " -traditional-cpp"
-+ set DEFAULT_TRADCPPFLAGS " -traditional-cpp -I$srcdir/$subdir/"
- }
-
- # Initialize `dg'.
---- a/src/gcc/testsuite/gcc.dg/cpp/warn-undef-2.c
-+++ b/src/gcc/testsuite/gcc.dg/cpp/warn-undef-2.c
-@@ -1,5 +1,5 @@
- // { dg-do preprocess }
- // { dg-options "-std=gnu99 -fdiagnostics-show-option -Werror=undef" }
- /* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */
--#if x // { dg-error "\"x\" is not defined .-Werror=undef." }
-+#if x // { dg-error "\"x\" is not defined, evaluates to 0 .-Werror=undef." }
- #endif
---- a/src/gcc/testsuite/gcc.dg/cpp/warn-undef.c
-+++ b/src/gcc/testsuite/gcc.dg/cpp/warn-undef.c
-@@ -1,5 +1,5 @@
- // { dg-do preprocess }
- // { dg-options "-std=gnu99 -fdiagnostics-show-option -Wundef" }
-
--#if x // { dg-warning "\"x\" is not defined .-Wundef." }
-+#if x // { dg-warning "\"x\" is not defined, evaluates to 0 .-Wundef." }
- #endif
---- a/src/gcc/testsuite/gcc.dg/lto/pr54709_0.c
-+++ b/src/gcc/testsuite/gcc.dg/lto/pr54709_0.c
-@@ -1,6 +1,7 @@
- /* { dg-lto-do link } */
- /* { dg-require-visibility "hidden" } */
- /* { dg-require-effective-target fpic } */
-+/* { dg-require-effective-target shared } */
- /* { dg-extra-ld-options { -shared } } */
- /* { dg-lto-options { { -fPIC -fvisibility=hidden -flto } } } */
-
---- a/src/gcc/testsuite/gcc.dg/lto/pr61526_0.c
-+++ b/src/gcc/testsuite/gcc.dg/lto/pr61526_0.c
-@@ -1,4 +1,5 @@
- /* { dg-require-effective-target fpic } */
-+/* { dg-require-effective-target shared } */
- /* { dg-lto-do link } */
- /* { dg-lto-options { { -fPIC -flto -flto-partition=1to1 } } } */
- /* { dg-extra-ld-options { -shared } } */
---- a/src/gcc/testsuite/gcc.dg/lto/pr64415_0.c
-+++ b/src/gcc/testsuite/gcc.dg/lto/pr64415_0.c
-@@ -1,5 +1,6 @@
- /* { dg-lto-do link } */
- /* { dg-require-effective-target fpic } */
-+/* { dg-require-effective-target shared } */
- /* { dg-lto-options { { -O -flto -fpic } } } */
- /* { dg-extra-ld-options { -shared } } */
- /* { dg-extra-ld-options "-Wl,-undefined,dynamic_lookup" { target *-*-darwin* } } */
---- a/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
-+++ b/src/gcc/testsuite/gcc.dg/plugin/plugin.exp
-@@ -87,6 +87,12 @@ foreach plugin_test $plugin_test_list {
- if ![runtest_file_p $runtests $plugin_src] then {
- continue
- }
-+ # Skip tail call tests on targets that do not have sibcall_epilogue.
-+ if {[regexp ".*must_tail_call_plugin.c" $plugin_src]
-+ && [istarget arm*-*-*]
-+ && [check_effective_target_arm_thumb1]} then {
-+ continue
-+ }
- set plugin_input_tests [lreplace $plugin_test 0 0]
- plugin-test-execute $plugin_src $plugin_input_tests
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/pr59833.c
-@@ -0,0 +1,18 @@
-+/* { dg-do run { target { *-*-linux* *-*-gnu* } } } */
-+/* { dg-options "-O0 -lm" } */
-+/* { dg-require-effective-target issignaling } */
-+
-+#define _GNU_SOURCE
-+#include <math.h>
-+
-+int main (void)
-+{
-+ float sNaN = __builtin_nansf ("");
-+ double x = (double) sNaN;
-+ if (issignaling(x))
-+ {
-+ __builtin_abort();
-+ }
-+
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/pr68217.c
-@@ -0,0 +1,14 @@
-+
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fdump-tree-vrp1" } */
-+
-+int foo (void)
-+{
-+ volatile int a = -1;
-+ long long b = (1LL << (sizeof (b) * 8 - 1)); // LLONG_MIN
-+ long long x = (a & b); // x == 0x8000000000000000
-+ if (x < 1LL) { ; } else { __builtin_abort(); }
-+ return 0;
-+}
-+
-+/* { dg-final { scan-tree-dump "\\\[-INF, 0\\\]" "vrp1" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/pr71636-1.c
-@@ -0,0 +1,9 @@
-+/* { dg-do compile } */
-+/* { dg-options "-fdump-tree-gimple" } */
-+
-+unsigned f(unsigned x, unsigned b)
-+{
-+ return x & ((1U << b) - 1);
-+}
-+
-+/* { dg-final { scan-tree-dump-not "1 <<" "gimple" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/pr71636-2.c
-@@ -0,0 +1,12 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fdump-tree-forwprop-details" } */
-+
-+unsigned f(unsigned x, unsigned b)
-+{
-+ unsigned t1 = 1U << b;
-+ unsigned t2 = t1 - 1;
-+ unsigned t3 = x & t2;
-+ return t3;
-+}
-+
-+/* { dg-final { scan-tree-dump "_\[0-9\] = ~_\[0-9\]" "forwprop1" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-20.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-20.c
-@@ -86,9 +86,9 @@ main ()
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 4 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-21.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-21.c
-@@ -57,9 +57,9 @@ main ()
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 3 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-22.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-22.c
-@@ -31,9 +31,9 @@ main ()
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 4 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-22g.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-22g.c
-@@ -5,9 +5,9 @@
- #define USE_GNU
- #include "strlenopt-22.c"
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 1 "strlen" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-26.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-26.c
-@@ -21,4 +21,5 @@ main (void)
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-5.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-5.c
-@@ -48,9 +48,9 @@ main ()
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 2 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 2 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-7.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-7.c
-@@ -40,11 +40,11 @@ main ()
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 0 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 2 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 1 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "\\*r_\[0-9\]* = 0;" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "return 3;" 1 "optimized" } } */
---- a/src/gcc/testsuite/gcc.dg/strlenopt-9.c
-+++ b/src/gcc/testsuite/gcc.dg/strlenopt-9.c
-@@ -98,10 +98,10 @@ main ()
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "strlen \\(" 3 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strlen \\(" 5 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "memcpy \\(" 6 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcpy \\(" 1 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "strcat \\(" 0 "strlen" } } */
--/* { dg-final { scan-tree-dump-times "strchr \\(" 3 "strlen" } } */
-+/* { dg-final { scan-tree-dump-times "strchr \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "stpcpy \\(" 0 "strlen" } } */
- /* { dg-final { scan-tree-dump-times "return 4;" 1 "optimized" } } */
---- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-int-convert-alt.c
-+++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-int-convert-alt.c
-@@ -1,5 +1,6 @@
- /* Test floating-point conversions. Standard types and __fp16. */
- /* { dg-do run { target arm*-*-* } } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- #include "fp-int-convert.h"
---- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-3.c
-+++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-3.c
-@@ -1,5 +1,6 @@
- /* Test various operators on __fp16 and mixed __fp16/float operands. */
- /* { dg-do run { target arm*-*-* } } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- #include "arm-fp16-ops.h"
---- a/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-4.c
-+++ b/src/gcc/testsuite/gcc.dg/torture/arm-fp16-ops-4.c
-@@ -1,5 +1,6 @@
- /* Test various operators on __fp16 and mixed __fp16/float operands. */
- /* { dg-do run { target arm*-*-* } } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative -ffast-math" } */
-
- #include "arm-fp16-ops.h"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/torture/pr71594.c
-@@ -0,0 +1,15 @@
-+/* { dg-do compile } */
-+/* { dg-options "--param max-rtl-if-conversion-insns=2" } */
-+
-+unsigned short a;
-+int b, c;
-+int *d;
-+void fn1() {
-+ *d = 24;
-+ for (; *d <= 65;) {
-+ unsigned short *e = &a;
-+ b = (a &= 0 <= 0) < (c ?: (*e %= *d));
-+ for (; *d <= 83;)
-+ ;
-+ }
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_1.c
-@@ -0,0 +1,44 @@
-+/* PR tree-optimization/61839. */
-+/* { dg-do run } */
-+/* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-optimized" } */
-+/* { dg-require-effective-target int32plus } */
-+
-+__attribute__ ((noinline))
-+int foo ()
-+{
-+ int a = -1;
-+ volatile unsigned b = 1U;
-+ int c = 1;
-+ c = (a + 972195718) >> (1LU <= b);
-+ if (c == 486097858)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+__attribute__ ((noinline))
-+int bar ()
-+{
-+ int a = -1;
-+ volatile unsigned b = 1U;
-+ int c = 1;
-+ c = (a + 972195718) >> (b ? 2 : 3);
-+ if (c == 243048929)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+int main ()
-+{
-+ foo ();
-+ bar ();
-+}
-+
-+/* Scan for c = 972195717) >> [0, 1] in function foo. */
-+/* { dg-final { scan-tree-dump-times "486097858 : 972195717" 1 "vrp1" } } */
-+/* Scan for c = 972195717) >> [2, 3] in function bar. */
-+/* { dg-final { scan-tree-dump-times "243048929 : 121524464" 2 "vrp1" } } */
-+/* { dg-final { scan-tree-dump-times "486097858" 0 "optimized" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_2.c
-@@ -0,0 +1,54 @@
-+/* PR tree-optimization/61839. */
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fdump-tree-vrp1" } */
-+/* { dg-require-effective-target int32plus } */
-+
-+__attribute__ ((noinline))
-+int foo ()
-+{
-+ int a = -1;
-+ volatile unsigned b = 1U;
-+ int c = 1;
-+ c = (a + 972195718) / (b ? 1 : 0);
-+ if (c == 972195717)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+__attribute__ ((noinline))
-+int bar ()
-+{
-+ int a = -1;
-+ volatile unsigned b = 1U;
-+ int c = 1;
-+ c = (a + 972195718) % (b ? 1 : 0);
-+ if (c == 972195717)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+__attribute__ ((noinline))
-+int bar2 ()
-+{
-+ int a = -1;
-+ volatile unsigned b = 1U;
-+ int c = 1;
-+ c = (a + 972195716) % (b ? 1 : 2);
-+ if (c == 972195715)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+
-+/* Don't optimize 972195717 / 0 in function foo.  */
-+/* { dg-final { scan-tree-dump-times "972195717 / _" 1 "vrp1" } } */
-+/* Don't optimize 972195717 % 0 in function bar.  */
-+/* { dg-final { scan-tree-dump-times "972195717 % _" 1 "vrp1" } } */
-+/* Optimize in function bar2. */
-+/* { dg-final { scan-tree-dump-times "972195715 % _" 0 "vrp1" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_3.c
-@@ -0,0 +1,26 @@
-+/* PR tree-optimization/61839. */
-+/* { dg-do run } */
-+/* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-optimized" } */
-+
-+__attribute__ ((noinline))
-+int foo (int a, unsigned b)
-+{
-+ int c = 1;
-+ b = a ? 12 : 13;
-+ c = b << 8;
-+ if (c == 3072)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+int main ()
-+{
-+ volatile unsigned b = 1U;
-+ foo (-1, b);
-+}
-+
-+/* Scan for c [12, 13] << 8 in function foo. */
-+/* { dg-final { scan-tree-dump-times "3072 : 3328" 2 "vrp1" } } */
-+/* { dg-final { scan-tree-dump-times "3072" 0 "optimized" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/pr61839_4.c
-@@ -0,0 +1,28 @@
-+/* PR tree-optimization/61839. */
-+/* { dg-do run } */
-+/* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-optimized" } */
-+/* { dg-require-effective-target int32plus } */
-+
-+__attribute__ ((noinline))
-+int foo (int a, unsigned b)
-+{
-+ unsigned c = 1;
-+ if (b >= 1 && b <= ((unsigned)(-1) - 1))
-+ return 0;
-+ c = b >> 4;
-+ if (c == 268435455)
-+ ;
-+ else
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+int main ()
-+{
-+ volatile unsigned b = (unsigned)(-1);
-+ foo (-1, b);
-+}
-+
-+/* Scan for ~[1, 4294967294] >> 4 in function foo. */
-+/* { dg-final { scan-tree-dump-times "0 : 268435455" 1 "vrp1" } } */
-+/* { dg-final { scan-tree-dump-times "268435455" 0 "optimized" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/scev-11.c
-@@ -0,0 +1,28 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
-+
-+int a[128];
-+extern int b[];
-+
-+int bar (int *);
-+
-+int
-+foo (int n)
-+{
-+ int i;
-+
-+ for (i = 0; i < n; i++)
-+ {
-+ unsigned char uc = (unsigned char)i;
-+ a[i] = i;
-+ b[uc] = 0;
-+ }
-+
-+ bar (a);
-+ return 0;
-+}
-+
-+/* The address of the array reference to b is a scev.  */
-+/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 2 "ivopts" } } */
-+
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/scev-12.c
-@@ -0,0 +1,30 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
-+
-+int a[128];
-+extern int b[];
-+
-+int bar (int *);
-+
-+int
-+foo (int x, int n)
-+{
-+ int i;
-+
-+ for (i = 0; i < n; i++)
-+ {
-+ unsigned char uc = (unsigned char)i;
-+ if (x)
-+ a[i] = i;
-+ b[uc] = 0;
-+ }
-+
-+ bar (a);
-+ return 0;
-+}
-+
-+/* The address of the array reference to b is not a scev.  */
-+/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 1 "ivopts" } } */
-+
-+
-+
---- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-2.c
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-2.c
-@@ -25,6 +25,7 @@ f1 (int i, ...)
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -45,6 +46,7 @@ f2 (int i, ...)
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 8 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -60,6 +62,7 @@ f3 (int i, ...)
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 1 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 16 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[1-9\]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[1-9\]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
-@@ -78,6 +81,7 @@ f4 (int i, ...)
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -96,6 +100,7 @@ f5 (int i, ...)
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -116,6 +121,7 @@ f6 (int i, ...)
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 3 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -133,6 +139,7 @@ f7 (int i, ...)
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -152,6 +159,7 @@ f8 (int i, ...)
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -169,6 +177,7 @@ f9 (int i, ...)
- /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f9: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -188,6 +197,7 @@ f10 (int i, ...)
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -208,6 +218,7 @@ f11 (int i, ...)
- /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 3 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 0, needs to save (3|12|24) GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -228,6 +239,7 @@ f12 (int i, ...)
- /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 24 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and 3 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save 0 GPR units and 48 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -248,6 +260,7 @@ f13 (int i, ...)
- /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 24 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and 3 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save 0 GPR units and 48 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f13: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -268,6 +281,7 @@ f14 (int i, ...)
- /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 24 GPR units and 3" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 1 GPR units and 2 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f14: va_list escapes 0, needs to save \[1-9]\[0-9\]* GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -291,6 +305,7 @@ f15 (int i, ...)
- /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save \[148\] GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f15: va_list escapes 0, needs to save 1 GPR units and 2 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f15: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- /* We may be able to improve upon this after fixing PR66010/PR66013. */
- /* { dg-final { scan-tree-dump "f15: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
---- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-3.c
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-3.c
-@@ -24,6 +24,7 @@ f1 (int i, ...)
- /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -39,6 +40,7 @@ f2 (int i, ...)
- /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -57,6 +59,7 @@ f3 (int i, ...)
- /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -73,6 +76,7 @@ f4 (int i, ...)
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -89,6 +93,7 @@ f5 (int i, ...)
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -107,6 +112,7 @@ f6 (int i, ...)
- /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -123,6 +129,7 @@ f7 (int i, ...)
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -139,6 +146,7 @@ f8 (int i, ...)
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f8: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -155,6 +163,7 @@ f10 (int i, ...)
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f10: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -171,6 +180,7 @@ f11 (int i, ...)
- /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f11: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -187,6 +197,7 @@ f12 (int i, ...)
- /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f12: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
---- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-4.c
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-4.c
-@@ -27,6 +27,7 @@ f1 (int i, ...)
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -44,6 +45,7 @@ f2 (int i, ...)
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save 0 GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes \[01\], needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -67,6 +69,7 @@ f3 (int i, ...)
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units and 0 FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
-@@ -88,6 +91,7 @@ f4 (int i, ...)
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and \[1-9\]\[0-9\]* FPR units" "stdarg" { target { powerpc*-*-linux* && { powerpc_fprs && ilp32 } } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 8 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and 1 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 0 GPR units and 16 FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save \[148\] GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
---- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-5.c
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-5.c
-@@ -25,6 +25,7 @@ f1 (int i, ...)
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f1: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- void
- f2 (int i, ...)
-@@ -38,6 +39,7 @@ f2 (int i, ...)
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and all FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f2: va_list escapes 0, needs to save all GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- /* Here va_arg can be executed at most as many times as va_start. */
- void
-@@ -56,6 +58,7 @@ f3 (int i, ...)
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 0 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 32 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 1 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f3: va_list escapes 0, needs to save 8 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- void
- f4 (int i, ...)
-@@ -74,6 +77,7 @@ f4 (int i, ...)
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 16 GPR units and 16 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 24 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 2 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f4: va_list escapes 0, needs to save 24 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- void
- f5 (int i, ...)
-@@ -88,6 +92,7 @@ f5 (int i, ...)
- /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 16 GPR units and 0 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 32 GPR units and 1" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save (4|2) GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f5: va_list escapes 0, needs to save 16 GPR units and 0 FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- void
- f6 (int i, ...)
-@@ -102,6 +107,7 @@ f6 (int i, ...)
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 32 GPR units and 3" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save (3|2) GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f6: va_list escapes 0, needs to save 8 GPR units and 32 FPR units" "stdarg" { target aarch64*-*-* } } } */
-
- void
- f7 (int i, ...)
-@@ -116,3 +122,4 @@ f7 (int i, ...)
- /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 0 GPR units and 64 FPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 || llp64 } } } } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 32 GPR units and 2" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 2 GPR units and 0 FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "f7: va_list escapes 0, needs to save 0 GPR units and 64 FPR units" "stdarg" { target aarch64*-*-* } } } */
---- a/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-6.c
-+++ b/src/gcc/testsuite/gcc.dg/tree-ssa/stdarg-6.c
-@@ -30,6 +30,7 @@ bar (int x, char const *y, ...)
- /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target { powerpc*-*-linux* && ilp32 } } } } */
- /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target alpha*-*-linux* } } } */
- /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target s390*-*-linux* } } } */
-+/* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units and all FPR units" "stdarg" { target aarch64*-*-* } } } */
- /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target { { i?86-*-* x86_64-*-* } && ia32 } } } } */
- /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target ia64-*-* } } } */
- /* { dg-final { scan-tree-dump "bar: va_list escapes 1, needs to save all GPR units" "stdarg" { target { powerpc*-*-* && lp64 } } } } */
---- a/src/gcc/testsuite/gcc.dg/uninit-pred-8_a.c
-+++ b/src/gcc/testsuite/gcc.dg/uninit-pred-8_a.c
-@@ -1,6 +1,8 @@
-
- /* { dg-do compile } */
- /* { dg-options "-Wuninitialized -O2" } */
-+/* Pick a particular tuning to pin down BRANCH_COST. */
-+/* { dg-additional-options "-mtune=cortex-a15" { target arm*-*-* } } */
-
- int g;
- void bar();
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-70.c
-@@ -0,0 +1,33 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target section_anchors } */
-+/* { dg-require-effective-target vect_int } */
-+
-+#define N 32
-+
-+/* Increase alignment of struct if an array's offset is a multiple of the
-+   alignment of the vector type corresponding to its scalar type.
-+ For the below test-case:
-+ offsetof(e) == 8 bytes.
-+ i) For arm: let x = alignment of vector type corresponding to int,
-+ x == 8 bytes.
-+ Since offsetof(e) % x == 0, set DECL_ALIGN(a, b, c) to x.
-+ ii) For aarch64, ppc: x == 16 bytes.
-+ Since offsetof(e) % x != 0, don't increase alignment of a, b, c.
-+*/
-+
-+static struct A {
-+ int p1, p2;
-+ int e[N];
-+} a, b, c;
-+
-+int foo(void)
-+{
-+ for (int i = 0; i < N; i++)
-+ a.e[i] = b.e[i] + c.e[i];
-+
-+ return a.e[0];
-+}
-+
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 3 "increase_alignment" { target arm*-*-* } } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-71.c
-@@ -0,0 +1,25 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target section_anchors } */
-+/* { dg-require-effective-target vect_int } */
-+
-+/* Should not increase alignment of the struct because
-+ sizeof (A.e) < sizeof(corresponding vector type). */
-+
-+#define N 3
-+
-+static struct A {
-+ int p1, p2;
-+ int e[N];
-+} a, b, c;
-+
-+int foo(void)
-+{
-+ for (int i = 0; i < N; i++)
-+ a.e[i] = b.e[i] + c.e[i];
-+
-+ return a.e[0];
-+}
-+
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target arm*-*-* } } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/aligned-section-anchors-vect-72.c
-@@ -0,0 +1,29 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target section_anchors } */
-+/* { dg-require-effective-target vect_int } */
-+
-+#define N 32
-+
-+/* Clone of aligned-section-anchors-vect-70.c, but with a nested struct. */
-+
-+struct S
-+{
-+ int e[N];
-+};
-+
-+static struct A {
-+ int p1, p2;
-+ struct S s;
-+} a, b, c;
-+
-+int foo(void)
-+{
-+ for (int i = 0; i < N; i++)
-+ a.s.e[i] = b.s.e[i] + c.s.e[i];
-+
-+ return a.s.e[0];
-+}
-+
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target aarch64*-*-* } } } */
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 0 "increase_alignment" { target powerpc64*-*-* } } } */
-+/* { dg-final { scan-ipa-dump-times "Increasing alignment of decl" 3 "increase_alignment" { target arm*-*-* } } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/pr57206.c
-@@ -0,0 +1,11 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target vect_float } */
-+
-+void bad0(float * d, unsigned int n)
-+{
-+ unsigned int i;
-+ for (i=n; i>0; --i)
-+ d[n-i] = 0.0;
-+}
-+
-+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/pr65951.c
-@@ -0,0 +1,63 @@
-+/* { dg-require-effective-target vect_int } */
-+
-+#include <stdarg.h>
-+#include "tree-vect.h"
-+
-+#define N 512
-+
-+/* These multiplications should be vectorizable with additions when
-+ no vector shift is available. */
-+
-+__attribute__ ((noinline)) void
-+foo (int *arr)
-+{
-+ for (int i = 0; i < N; i++)
-+ arr[i] *= 2;
-+}
-+
-+__attribute__ ((noinline)) void
-+foo2 (int *arr)
-+{
-+ for (int i = 0; i < N; i++)
-+ arr[i] *= 4;
-+}
-+
-+int
-+main (void)
-+{
-+ check_vect ();
-+ int data[N];
-+ int i;
-+
-+ for (i = 0; i < N; i++)
-+ {
-+ data[i] = i;
-+ __asm__ volatile ("");
-+ }
-+
-+ foo (data);
-+ for (i = 0; i < N; i++)
-+ {
-+ if (data[i] / 2 != i)
-+ __builtin_abort ();
-+ __asm__ volatile ("");
-+ }
-+
-+ for (i = 0; i < N; i++)
-+ {
-+ data[i] = i;
-+ __asm__ volatile ("");
-+ }
-+
-+ foo2 (data);
-+ for (i = 0; i < N; i++)
-+ {
-+ if (data[i] / 4 != i)
-+ __builtin_abort ();
-+ __asm__ volatile ("");
-+ }
-+
-+ return 0;
-+}
-+
-+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/pr71818.c
-@@ -0,0 +1,16 @@
-+/* { dg-do compile } */
-+
-+char a;
-+short b;
-+int c, d;
-+void fn1() {
-+ char e = 75, g;
-+ unsigned char *f = &e;
-+ a = 21;
-+ for (; a <= 48; a++) {
-+ for (; e <= 6;)
-+ ;
-+ g -= e -= b || g <= c;
-+ }
-+ d = *f;
-+}
---- a/src/gcc/testsuite/gcc.dg/vect/vect-iv-9.c
-+++ b/src/gcc/testsuite/gcc.dg/vect/vect-iv-9.c
-@@ -33,5 +33,4 @@ int main (void)
- return 0;
- }
-
--/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_int_mult } } } */
--/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target {! vect_int_mult } } } } */
-+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
-@@ -0,0 +1,13 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target vect_int } */
-+/* { dg-require-effective-target vect_load_lanes } */
-+
-+void
-+f (int *__restrict a, int *__restrict b)
-+{
-+ for (int i = 0; i < 96; ++i)
-+ a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
-+}
-+
-+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
-+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-1.c
-@@ -0,0 +1,41 @@
-+/* { dg-require-effective-target vect_int } */
-+/* { dg-require-effective-target vect_shift } */
-+
-+#include <stdarg.h>
-+#include "tree-vect.h"
-+
-+#define N 256
-+
-+__attribute__ ((noinline)) void
-+foo (long long *arr)
-+{
-+ for (int i = 0; i < N; i++)
-+ arr[i] *= 123;
-+}
-+
-+int
-+main (void)
-+{
-+ check_vect ();
-+ long long data[N];
-+ int i;
-+
-+ for (i = 0; i < N; i++)
-+ {
-+ data[i] = i;
-+ __asm__ volatile ("");
-+ }
-+
-+ foo (data);
-+ for (i = 0; i < N; i++)
-+ {
-+ if (data[i] / 123 != i)
-+ __builtin_abort ();
-+ __asm__ volatile ("");
-+ }
-+
-+ return 0;
-+}
-+
-+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect" { target aarch64*-*-* } } } */
-+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target aarch64*-*-* } } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-2.c
-@@ -0,0 +1,40 @@
-+/* { dg-require-effective-target vect_int } */
-+
-+#include <stdarg.h>
-+#include "tree-vect.h"
-+
-+#define N 256
-+
-+__attribute__ ((noinline)) void
-+foo (long long *arr)
-+{
-+ for (int i = 0; i < N; i++)
-+ arr[i] *= -19594LL;
-+}
-+
-+int
-+main (void)
-+{
-+ check_vect ();
-+ long long data[N];
-+ int i;
-+
-+ for (i = 0; i < N; i++)
-+ {
-+ data[i] = i;
-+ __asm__ volatile ("");
-+ }
-+
-+ foo (data);
-+ for (i = 0; i < N; i++)
-+ {
-+ if (data[i] / -19594LL != i)
-+ __builtin_abort ();
-+ __asm__ volatile ("");
-+ }
-+
-+ return 0;
-+}
-+
-+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect" { target aarch64*-*-* } } } */
-+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target aarch64*-*-* } } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp
-@@ -53,7 +53,10 @@ torture-init
- set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS
-
- # Make sure Neon flags are provided, if necessary. Use fp16 if we can.
--if {[check_effective_target_arm_neon_fp16_ok]} then {
-+# Use fp16 arithmetic operations if the hardware supports it.
-+if {[check_effective_target_arm_v8_2a_fp16_neon_hw]} then {
-+ set additional_flags [add_options_for_arm_v8_2a_fp16_neon ""]
-+} elseif {[check_effective_target_arm_neon_fp16_ok]} then {
- set additional_flags [add_options_for_arm_neon_fp16 ""]
- } else {
- set additional_flags [add_options_for_arm_neon ""]
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
-@@ -16,6 +16,14 @@ extern void *memset(void *, int, size_t);
- extern void *memcpy(void *, const void *, size_t);
- extern size_t strlen(const char *);
-
-+/* Helper macro to select FP16 tests. */
-+#if (defined (__ARM_FP16_FORMAT_IEEE) \
-+ || defined (__ARM_FP16_FORMAT_ALTERNATIVE))
-+#define FP16_SUPPORTED (1)
-+#else
-+#undef FP16_SUPPORTED
-+#endif
-+
- /* Various string construction helpers. */
-
- /*
-@@ -24,6 +32,13 @@ extern size_t strlen(const char *);
- VECT_VAR(expected, int, 16, 4) -> expected_int16x4
- VECT_VAR_DECL(expected, int, 16, 4) -> int16x4_t expected_int16x4
- */
-+/* Some instructions don't exist on ARM.
-+ Use this macro to guard against them. */
-+#ifdef __aarch64__
-+#define AARCH64_ONLY(X) X
-+#else
-+#define AARCH64_ONLY(X)
-+#endif
-
- #define xSTR(X) #X
- #define STR(X) xSTR(X)
-@@ -81,7 +96,7 @@ extern size_t strlen(const char *);
- abort(); \
- } \
- } \
-- fprintf(stderr, "CHECKED %s\n", MSG); \
-+ fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
- }
-
- /* Floating-point variant. */
-@@ -110,7 +125,36 @@ extern size_t strlen(const char *);
- abort(); \
- } \
- } \
-- fprintf(stderr, "CHECKED %s\n", MSG); \
-+ fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
-+ }
-+
-+/* poly variant. */
-+#define CHECK_POLY(MSG,T,W,N,FMT,EXPECTED,COMMENT) \
-+ { \
-+ int i; \
-+ for(i=0; i<N ; i++) \
-+ { \
-+ union poly_operand { \
-+ uint##W##_t i; \
-+ poly##W##_t p; \
-+ } tmp_res, tmp_exp; \
-+ tmp_res.p = VECT_VAR(result, T, W, N)[i]; \
-+ tmp_exp.i = VECT_VAR(EXPECTED, T, W, N)[i]; \
-+ if (tmp_res.i != tmp_exp.i) { \
-+ fprintf(stderr, \
-+ "ERROR in %s (%s line %d in buffer '%s') at type %s " \
-+ "index %d: got 0x%" FMT " != 0x%" FMT " %s\n", \
-+ MSG, __FILE__, __LINE__, \
-+ STR(EXPECTED), \
-+ STR(VECT_NAME(T, W, N)), \
-+ i, \
-+ tmp_res.i, \
-+ tmp_exp.i, \
-+ strlen(COMMENT) > 0 ? COMMENT : ""); \
-+ abort(); \
-+ } \
-+ } \
-+ fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG); \
- }
-
- /* Clean buffer with a non-zero pattern to help diagnose buffer
-@@ -133,10 +177,16 @@ static ARRAY(result, uint, 32, 2);
- static ARRAY(result, uint, 64, 1);
- static ARRAY(result, poly, 8, 8);
- static ARRAY(result, poly, 16, 4);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+static ARRAY(result, poly, 64, 1);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- static ARRAY(result, float, 16, 4);
- #endif
- static ARRAY(result, float, 32, 2);
-+#ifdef __aarch64__
-+static ARRAY(result, float, 64, 1);
-+#endif
- static ARRAY(result, int, 8, 16);
- static ARRAY(result, int, 16, 8);
- static ARRAY(result, int, 32, 4);
-@@ -147,6 +197,9 @@ static ARRAY(result, uint, 32, 4);
- static ARRAY(result, uint, 64, 2);
- static ARRAY(result, poly, 8, 16);
- static ARRAY(result, poly, 16, 8);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+static ARRAY(result, poly, 64, 2);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- static ARRAY(result, float, 16, 8);
- #endif
-@@ -169,6 +222,7 @@ extern ARRAY(expected, poly, 8, 8);
- extern ARRAY(expected, poly, 16, 4);
- extern ARRAY(expected, hfloat, 16, 4);
- extern ARRAY(expected, hfloat, 32, 2);
-+extern ARRAY(expected, hfloat, 64, 1);
- extern ARRAY(expected, int, 8, 16);
- extern ARRAY(expected, int, 16, 8);
- extern ARRAY(expected, int, 32, 4);
-@@ -193,8 +247,8 @@ extern ARRAY(expected, hfloat, 64, 2);
- CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
- CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
- \
- CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
-@@ -205,8 +259,8 @@ extern ARRAY(expected, hfloat, 64, 2);
- CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
- CHECK(test_name, uint, 64, 2, PRIx64, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \
- } \
-
-@@ -335,7 +389,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
- strlen(COMMENT) > 0 ? " " COMMENT : ""); \
- abort(); \
- } \
-- fprintf(stderr, "CHECKED CUMULATIVE SAT %s\n", MSG); \
-+ fprintf(stderr, "CHECKED CUMULATIVE SAT %s %s\n", \
-+ STR(VECT_TYPE(T, W, N)), MSG); \
- }
-
- #define CHECK_CUMULATIVE_SAT_NAMED(test_name,EXPECTED,comment) \
-@@ -379,6 +434,9 @@ static void clean_results (void)
- CLEAN(result, uint, 64, 1);
- CLEAN(result, poly, 8, 8);
- CLEAN(result, poly, 16, 4);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+ CLEAN(result, poly, 64, 1);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- CLEAN(result, float, 16, 4);
- #endif
-@@ -394,6 +452,9 @@ static void clean_results (void)
- CLEAN(result, uint, 64, 2);
- CLEAN(result, poly, 8, 16);
- CLEAN(result, poly, 16, 8);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+ CLEAN(result, poly, 64, 2);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- CLEAN(result, float, 16, 8);
- #endif
-@@ -419,6 +480,13 @@ static void clean_results (void)
- #define DECL_VARIABLE(VAR, T1, W, N) \
- VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)
-
-+#if defined (__ARM_FEATURE_CRYPTO)
-+#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N) \
-+ DECL_VARIABLE(VAR, T1, W, N)
-+#else
-+#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N)
-+#endif
-+
- /* Declare only 64 bits signed variants. */
- #define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR) \
- DECL_VARIABLE(VAR, int, 8, 8); \
-@@ -454,6 +522,7 @@ static void clean_results (void)
- DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
- DECL_VARIABLE(VAR, poly, 8, 8); \
- DECL_VARIABLE(VAR, poly, 16, 4); \
-+ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \
- DECL_VARIABLE(VAR, float, 16, 4); \
- DECL_VARIABLE(VAR, float, 32, 2)
- #else
-@@ -462,6 +531,7 @@ static void clean_results (void)
- DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR); \
- DECL_VARIABLE(VAR, poly, 8, 8); \
- DECL_VARIABLE(VAR, poly, 16, 4); \
-+ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1); \
- DECL_VARIABLE(VAR, float, 32, 2)
- #endif
-
-@@ -472,6 +542,7 @@ static void clean_results (void)
- DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
- DECL_VARIABLE(VAR, poly, 8, 16); \
- DECL_VARIABLE(VAR, poly, 16, 8); \
-+ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \
- DECL_VARIABLE(VAR, float, 16, 8); \
- DECL_VARIABLE(VAR, float, 32, 4)
- #else
-@@ -480,6 +551,7 @@ static void clean_results (void)
- DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR); \
- DECL_VARIABLE(VAR, poly, 8, 16); \
- DECL_VARIABLE(VAR, poly, 16, 8); \
-+ DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2); \
- DECL_VARIABLE(VAR, float, 32, 4)
- #endif
- /* Declare all variants. */
-@@ -500,15 +572,6 @@ static void clean_results (void)
- /* Helpers to initialize vectors. */
- #define VDUP(VAR, Q, T1, T2, W, N, V) \
- VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)
--#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
--/* Work around that there is no vdup_n_f16 intrinsic. */
--#define vdup_n_f16(VAL) \
-- __extension__ \
-- ({ \
-- float16_t f = VAL; \
-- vld1_dup_f16(&f); \
-- })
--#endif
-
- #define VSET_LANE(VAR, Q, T1, T2, W, N, L, V) \
- VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V, \
-@@ -521,6 +584,13 @@ static void clean_results (void)
-
- /* Helpers to call macros with 1 constant and 5 variable
- arguments. */
-+#if defined (__ARM_FEATURE_CRYPTO)
-+#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N) \
-+ MACRO(VAR1, VAR2, T1, T2, T3, W, N)
-+#else
-+#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N)
-+#endif
-+
- #define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR) \
- MACRO(VAR, , int, s, 8, 8); \
- MACRO(VAR, , int, s, 16, 4); \
-@@ -591,13 +661,15 @@ static void clean_results (void)
- TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
- TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
- MACRO(VAR1, VAR2, , poly, p, 8, 8); \
-- MACRO(VAR1, VAR2, , poly, p, 16, 4)
-+ MACRO(VAR1, VAR2, , poly, p, 16, 4); \
-+ MACRO_CRYPTO(MACRO, VAR1, VAR2, , poly, p, 64, 1)
-
- #define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2) \
- TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
- TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2); \
- MACRO(VAR1, VAR2, q, poly, p, 8, 16); \
-- MACRO(VAR1, VAR2, q, poly, p, 16, 8)
-+ MACRO(VAR1, VAR2, q, poly, p, 16, 8); \
-+ MACRO_CRYPTO(MACRO, VAR1, VAR2, q, poly, p, 64, 2)
-
- #define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2) \
- TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2); \
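The point of the new CHECK_POLY macro above: polynomial vector element types do not reliably support == / != across compilers, so each lane is type-punned through a union and compared as an unsigned integer of the same width. A self-contained sketch of that comparison, with uint16_t standing in for poly16_t (the stand-in type and function name are illustrative):

#include <assert.h>
#include <stdint.h>

typedef uint16_t fake_poly16_t;  /* stand-in for poly16_t */

static int
poly16_bits_equal (fake_poly16_t res, uint16_t expected)
{
  union { uint16_t i; fake_poly16_t p; } tmp = { .p = res };
  return tmp.i == expected;  /* compare the underlying bits */
}

int
main (void)
{
  assert (poly16_bits_equal ((fake_poly16_t) 0xfff0, 0xfff0));
  return 0;
}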
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_float.inc
-@@ -0,0 +1,170 @@
-+/* Floating-point-only version of the binary_op_no64.inc template.
-+   Currently only float16_t is used. */
-+
-+#include <math.h>
-+
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1(NAME)
-+
-+void FNNAME (INSN_NAME) (void)
-+{
-+ int i;
-+
-+ /* Basic test: z = INSN (x, y), then store the result. */
-+#define TEST_BINARY_OP1(INSN, Q, T1, T2, W, N) \
-+ VECT_VAR(vector_res, T1, W, N) = \
-+ INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
-+ VECT_VAR(vector2, T1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
-+
-+#define TEST_BINARY_OP(INSN, Q, T1, T2, W, N) \
-+ TEST_BINARY_OP1(INSN, Q, T1, T2, W, N) \
-+
-+#ifdef HAS_FLOAT16_VARIANT
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+
-+ DECL_VARIABLE(vector, float, 16, 8);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+
-+#ifdef HAS_FLOAT_VARIANT
-+ DECL_VARIABLE(vector, float, 32, 2);
-+ DECL_VARIABLE(vector2, float, 32, 2);
-+ DECL_VARIABLE(vector_res, float, 32, 2);
-+
-+ DECL_VARIABLE(vector, float, 32, 4);
-+ DECL_VARIABLE(vector2, float, 32, 4);
-+ DECL_VARIABLE(vector_res, float, 32, 4);
-+#endif
-+
-+ clean_results ();
-+
-+ /* Initialize input "vector" from "buffer". */
-+#ifdef HAS_FLOAT16_VARIANT
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-+#ifdef HAS_FLOAT_VARIANT
-+ VLOAD(vector, buffer, , float, f, 32, 2);
-+ VLOAD(vector, buffer, q, float, f, 32, 4);
-+#endif
-+
-+ /* Choose init value arbitrarily, will be used as comparison value. */
-+#ifdef HAS_FLOAT16_VARIANT
-+ VDUP(vector2, , float, f, 16, 4, -15.5f);
-+ VDUP(vector2, q, float, f, 16, 8, -14.5f);
-+#endif
-+#ifdef HAS_FLOAT_VARIANT
-+ VDUP(vector2, , float, f, 32, 2, -15.5f);
-+ VDUP(vector2, q, float, f, 32, 4, -14.5f);
-+#endif
-+
-+#ifdef HAS_FLOAT16_VARIANT
-+#define FLOAT16_VARIANT(MACRO, VAR) \
-+ MACRO(VAR, , float, f, 16, 4); \
-+ MACRO(VAR, q, float, f, 16, 8);
-+#else
-+#define FLOAT16_VARIANT(MACRO, VAR)
-+#endif
-+
-+#ifdef HAS_FLOAT_VARIANT
-+#define FLOAT_VARIANT(MACRO, VAR) \
-+ MACRO(VAR, , float, f, 32, 2); \
-+ MACRO(VAR, q, float, f, 32, 4);
-+#else
-+#define FLOAT_VARIANT(MACRO, VAR)
-+#endif
-+
-+#define TEST_MACRO_NO64BIT_VARIANT_1_5(MACRO, VAR) \
-+
-+ /* Apply a binary operator named INSN_NAME. */
-+ FLOAT16_VARIANT(TEST_BINARY_OP, INSN_NAME);
-+ FLOAT_VARIANT(TEST_BINARY_OP, INSN_NAME);
-+
-+#ifdef HAS_FLOAT16_VARIANT
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+
-+ /* Extra FP tests with special values (NaN, ....) */
-+ VDUP(vector, q, float, f, 16, 8, 1.0f);
-+ VDUP(vector2, q, float, f, 16, 8, NAN);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_nan,
-+ " FP special (NaN)");
-+
-+ VDUP(vector, q, float, f, 16, 8, -NAN);
-+ VDUP(vector2, q, float, f, 16, 8, 1.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_mnan,
-+ " FP special (-NaN)");
-+
-+ VDUP(vector, q, float, f, 16, 8, 1.0f);
-+ VDUP(vector2, q, float, f, 16, 8, HUGE_VALF);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_inf,
-+ " FP special (inf)");
-+
-+ VDUP(vector, q, float, f, 16, 8, -HUGE_VALF);
-+ VDUP(vector2, q, float, f, 16, 8, 1.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_minf,
-+ " FP special (-inf)");
-+
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, -0.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero1,
-+ " FP special (-0.0)");
-+
-+ VDUP(vector, q, float, f, 16, 8, -0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, 0.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero2,
-+ " FP special (-0.0)");
-+#endif
-+
-+#ifdef HAS_FLOAT_VARIANT
-+ CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
-+
-+ /* Extra FP tests with special values (NaN, ....) */
-+ VDUP(vector, q, float, f, 32, 4, 1.0f);
-+ VDUP(vector2, q, float, f, 32, 4, NAN);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_nan, " FP special (NaN)");
-+
-+ VDUP(vector, q, float, f, 32, 4, -NAN);
-+ VDUP(vector2, q, float, f, 32, 4, 1.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_mnan, " FP special (-NaN)");
-+
-+ VDUP(vector, q, float, f, 32, 4, 1.0f);
-+ VDUP(vector2, q, float, f, 32, 4, HUGE_VALF);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_inf, " FP special (inf)");
-+
-+ VDUP(vector, q, float, f, 32, 4, -HUGE_VALF);
-+ VDUP(vector2, q, float, f, 32, 4, 1.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_minf, " FP special (-inf)");
-+
-+ VDUP(vector, q, float, f, 32, 4, 0.0f);
-+ VDUP(vector2, q, float, f, 32, 4, -0.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_zero1, " FP special (-0.0)");
-+
-+ VDUP(vector, q, float, f, 32, 4, -0.0f);
-+ VDUP(vector2, q, float, f, 32, 4, 0.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 32, 4);
-+ CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_zero2, " FP special (-0.0)");
-+#endif
-+}
-+
-+int main (void)
-+{
-+ FNNAME (INSN_NAME) ();
-+ return 0;
-+}
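The expected_nan/expected_inf/expected_zero* buffers consumed by this template encode ordinary IEEE 754 identities: a NaN operand propagates through arithmetic, infinities stay infinite under finite addition, and +0.0 and -0.0 compare equal while differing in sign bit. A scalar check of those identities (which lanes a given test expects still depends on its INSN_NAME):

#include <assert.h>
#include <math.h>

int
main (void)
{
  assert (isnan (1.0f + (float) NAN));           /* NaN propagates */
  assert (isinf (1.0f + HUGE_VALF));             /* inf + finite = inf */
  assert (0.0f == -0.0f);                        /* equal as values */
  assert (signbit (-0.0f) && !signbit (0.0f));   /* distinct bit patterns */
  return 0;
}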
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_no64.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_op_no64.inc
-@@ -28,6 +28,10 @@ void FNNAME (INSN_NAME) (void)
-
- /* Initialize input "vector" from "buffer". */
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#ifdef HAS_FLOAT16_VARIANT
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- #ifdef HAS_FLOAT_VARIANT
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-@@ -46,15 +50,27 @@ void FNNAME (INSN_NAME) (void)
- VDUP(vector2, q, uint, u, 8, 16, 0xf9);
- VDUP(vector2, q, uint, u, 16, 8, 0xfff2);
- VDUP(vector2, q, uint, u, 32, 4, 0xfffffff1);
-+#ifdef HAS_FLOAT16_VARIANT
-+ VDUP(vector2, , float, f, 16, 4, -15.5f);
-+ VDUP(vector2, q, float, f, 16, 8, -14.5f);
-+#endif
- #ifdef HAS_FLOAT_VARIANT
- VDUP(vector2, , float, f, 32, 2, -15.5f);
- VDUP(vector2, q, float, f, 32, 4, -14.5f);
- #endif
-
-+#ifdef HAS_FLOAT16_VARIANT
-+#define FLOAT16_VARIANT(MACRO, VAR) \
-+ MACRO(VAR, , float, f, 16, 4); \
-+ MACRO(VAR, q, float, f, 16, 8);
-+#else
-+#define FLOAT16_VARIANT(MACRO, VAR)
-+#endif
-+
- #ifdef HAS_FLOAT_VARIANT
- #define FLOAT_VARIANT(MACRO, VAR) \
- MACRO(VAR, , float, f, 32, 2); \
-- MACRO(VAR, q, float, f, 32, 4)
-+ MACRO(VAR, q, float, f, 32, 4);
- #else
- #define FLOAT_VARIANT(MACRO, VAR)
- #endif
-@@ -72,7 +88,8 @@ void FNNAME (INSN_NAME) (void)
- MACRO(VAR, q, uint, u, 8, 16); \
- MACRO(VAR, q, uint, u, 16, 8); \
- MACRO(VAR, q, uint, u, 32, 4); \
-- FLOAT_VARIANT(MACRO, VAR)
-+ FLOAT_VARIANT(MACRO, VAR); \
-+ FLOAT16_VARIANT(MACRO, VAR);
-
- /* Apply a binary operator named INSN_NAME. */
- TEST_MACRO_NO64BIT_VARIANT_1_5(TEST_BINARY_OP, INSN_NAME);
-@@ -90,6 +107,42 @@ void FNNAME (INSN_NAME) (void)
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-
-+#ifdef HAS_FLOAT16_VARIANT
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+
-+ /* Extra FP tests with special values (NaN, ....) */
-+ VDUP(vector, q, float, f, 16, 8, 1.0f);
-+ VDUP(vector2, q, float, f, 16, 8, NAN);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_nan, " FP special (NaN)");
-+
-+ VDUP(vector, q, float, f, 16, 8, -NAN);
-+ VDUP(vector2, q, float, f, 16, 8, 1.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_mnan, " FP special (-NaN)");
-+
-+ VDUP(vector, q, float, f, 16, 8, 1.0f);
-+ VDUP(vector2, q, float, f, 16, 8, HUGE_VALF);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_inf, " FP special (inf)");
-+
-+ VDUP(vector, q, float, f, 16, 8, -HUGE_VALF);
-+ VDUP(vector2, q, float, f, 16, 8, 1.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_minf, " FP special (-inf)");
-+
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, -0.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero1, " FP special (-0.0)");
-+
-+ VDUP(vector, q, float, f, 16, 8, -0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, 0.0f);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_zero2, " FP special (-0.0)");
-+#endif
-+
- #ifdef HAS_FLOAT_VARIANT
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/binary_scalar_op.inc
-@@ -0,0 +1,160 @@
-+/* Template file for binary scalar operator validation.
-+
-+ This file is meant to be included by test files for binary scalar
-+ operations. */
-+
-+/* Check for required settings. */
-+
-+#ifndef INSN_NAME
-+#error INSN_NAME (the intrinsic to test) must be defined.
-+#endif
-+
-+#ifndef INPUT_TYPE
-+#error INPUT_TYPE (basic type of an input value) must be defined.
-+#endif
-+
-+#ifndef OUTPUT_TYPE
-+#error OUTPUT_TYPE (basic type of an output value) must be defined.
-+#endif
-+
-+#ifndef OUTPUT_TYPE_SIZE
-+#error OUTPUT_TYPE_SIZE (size in bits of an output value) must be defined.
-+#endif
-+
-+/* Optional settings:
-+
-+ INPUT_1: Input values for the first parameter. Must be of type INPUT_TYPE.
-+   INPUT_2: Input values for the second parameter.  Must be of type
-+ INPUT_TYPE. */
-+
-+#ifndef TEST_MSG
-+#define TEST_MSG "unnamed test"
-+#endif
-+
-+/* The test framework. */
-+
-+#include <stdio.h>
-+
-+extern void abort ();
-+
-+#define INFF __builtin_inf ()
-+
-+/* Stringify a macro. */
-+#define STR0(A) #A
-+#define STR(A) STR0 (A)
-+
-+/* Macro concatenation. */
-+#define CAT0(A, B) A##B
-+#define CAT(A, B) CAT0 (A, B)
-+
-+/* Format strings for error reporting. */
-+#define FMT16 "0x%04x"
-+#define FMT32 "0x%08x"
-+#define FMT CAT (FMT,OUTPUT_TYPE_SIZE)
-+
-+/* Type construction: forms TS_t, where T is the base type and S the size in
-+ bits. */
-+#define MK_TYPE0(T, S) T##S##_t
-+#define MK_TYPE(T, S) MK_TYPE0 (T, S)
-+
-+/* Convenience types for input and output data. */
-+typedef MK_TYPE (uint, OUTPUT_TYPE_SIZE) output_hex_type;
-+
-+/* Conversion between typed values and their hexadecimal representation. */
-+typedef union
-+{
-+ OUTPUT_TYPE value;
-+ output_hex_type hex;
-+} output_conv_type;
-+
-+/* Default input values. */
-+
-+float16_t input_1_float16_t[] =
-+{
-+ 0.0, -0.0,
-+ 2.0, 3.1,
-+ 20.0, 0.40,
-+ -2.3, 1.33,
-+ -7.6, 0.31,
-+ 0.3353, 0.5,
-+ 1.0, 13.13,
-+ -6.3, 20.0,
-+ (float16_t)INFF, (float16_t)-INFF,
-+};
-+
-+float16_t input_2_float16_t[] =
-+{
-+ 1.0, 1.0,
-+ -4.33, 100.0,
-+ 30.0, -0.02,
-+ 0.5, -7.231,
-+ -6.3, 20.0,
-+ -7.231, 2.3,
-+ -7.6, 5.1,
-+ 0.31, 0.33353,
-+ (float16_t)-INFF, (float16_t)INFF,
-+};
-+
-+#ifndef INPUT_1
-+#define INPUT_1 CAT (input_1_,INPUT_TYPE)
-+#endif
-+
-+#ifndef INPUT_2
-+#define INPUT_2 CAT (input_2_,INPUT_TYPE)
-+#endif
-+
-+/* Support macros and routines for the test function. */
-+
-+#define CHECK() \
-+ { \
-+ output_conv_type actual; \
-+ output_conv_type expect; \
-+ \
-+ expect.hex = ((output_hex_type*)EXPECTED)[index]; \
-+ actual.value = INSN_NAME ((INPUT_1)[index], \
-+ (INPUT_2)[index]); \
-+ \
-+ if (actual.hex != expect.hex) \
-+ { \
-+ fprintf (stderr, \
-+ "ERROR in %s (%s line %d), buffer %s, " \
-+ "index %d: got " \
-+ FMT " != " FMT "\n", \
-+ TEST_MSG, __FILE__, __LINE__, \
-+ STR (EXPECTED), index, \
-+ actual.hex, expect.hex); \
-+ abort (); \
-+ } \
-+ fprintf (stderr, "CHECKED %s %s\n", \
-+ STR (EXPECTED), TEST_MSG); \
-+ }
-+
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1 (NAME)
-+
-+/* The test function. */
-+
-+void
-+FNNAME (INSN_NAME) (void)
-+{
-+  /* Basic test: y[i] = OP (x1[i], x2[i]) for each pair of input values,
-+     then compare the result against EXPECTED[i]. */
-+
-+ const int num_tests = sizeof (INPUT_1) / sizeof (INPUT_1[0]);
-+ int index;
-+
-+ for (index = 0; index < num_tests; index++)
-+ CHECK ();
-+
-+#ifdef EXTRA_TESTS
-+ EXTRA_TESTS ();
-+#endif
-+}
-+
-+int
-+main (void)
-+{
-+ FNNAME (INSN_NAME) ();
-+
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_fp_op.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_fp_op.inc
-@@ -15,6 +15,10 @@
- each test file. */
- extern ARRAY(expected2, uint, 32, 2);
- extern ARRAY(expected2, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+extern ARRAY(expected2, uint, 16, 4);
-+extern ARRAY(expected2, uint, 16, 8);
-+#endif
-
- #define FNNAME1(NAME) exec_ ## NAME
- #define FNNAME(NAME) FNNAME1(NAME)
-@@ -37,17 +41,33 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector2, float, 32, 4);
- DECL_VARIABLE(vector_res, uint, 32, 2);
- DECL_VARIABLE(vector_res, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+ DECL_VARIABLE(vector_res, uint, 16, 4);
-+ DECL_VARIABLE(vector_res, uint, 16, 8);
-+#endif
-
- clean_results ();
-
- /* Initialize input "vector" from "buffer". */
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-
- /* Choose init value arbitrarily, will be used for vector
- comparison. */
- VDUP(vector2, , float, f, 32, 2, -16.0f);
- VDUP(vector2, q, float, f, 32, 4, -14.0f);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, -16.0f);
-+ VDUP(vector2, q, float, f, 16, 8, -14.0f);
-+#endif
-
- /* Apply operator named INSN_NAME. */
- TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2);
-@@ -56,15 +76,36 @@ void FNNAME (INSN_NAME) (void)
- TEST_VCOMP(INSN_NAME, q, float, f, uint, 32, 4);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VCOMP(INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
-+
-+ TEST_VCOMP(INSN_NAME, q, float, f, uint, 16, 8);
-+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
-+#endif
-+
- /* Test again, with different input values. */
- VDUP(vector2, , float, f, 32, 2, -10.0f);
- VDUP(vector2, q, float, f, 32, 4, 10.0f);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, -10.0f);
-+ VDUP(vector2, q, float, f, 16, 8, 10.0f);
-+#endif
-+
- TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected2, "");
-
- TEST_VCOMP(INSN_NAME, q, float, f, uint, 32, 4);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected2,"");
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VCOMP(INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected2, "");
-+
-+ TEST_VCOMP(INSN_NAME, q, float, f, uint, 16, 8);
-+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected2,"");
-+#endif
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_op.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_op.inc
-@@ -11,6 +11,17 @@ extern ARRAY(expected_uint, uint, 32, 2);
- extern ARRAY(expected_q_uint, uint, 8, 16);
- extern ARRAY(expected_q_uint, uint, 16, 8);
- extern ARRAY(expected_q_uint, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+extern ARRAY(expected_float, uint, 16, 4);
-+extern ARRAY(expected_q_float, uint, 16, 8);
-+extern ARRAY(expected_nan, uint, 16, 4);
-+extern ARRAY(expected_mnan, uint, 16, 4);
-+extern ARRAY(expected_nan2, uint, 16, 4);
-+extern ARRAY(expected_inf, uint, 16, 4);
-+extern ARRAY(expected_minf, uint, 16, 4);
-+extern ARRAY(expected_inf2, uint, 16, 4);
-+extern ARRAY(expected_mzero, uint, 16, 4);
-+#endif
- extern ARRAY(expected_float, uint, 32, 2);
- extern ARRAY(expected_q_float, uint, 32, 4);
- extern ARRAY(expected_uint2, uint, 32, 2);
-@@ -48,6 +59,9 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector, uint, 8, 8);
- DECL_VARIABLE(vector, uint, 16, 4);
- DECL_VARIABLE(vector, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE (vector, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, int, 8, 16);
- DECL_VARIABLE(vector, int, 16, 8);
-@@ -55,6 +69,9 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector, uint, 8, 16);
- DECL_VARIABLE(vector, uint, 16, 8);
- DECL_VARIABLE(vector, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE (vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 4);
-
- DECL_VARIABLE(vector2, int, 8, 8);
-@@ -63,6 +80,9 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector2, uint, 8, 8);
- DECL_VARIABLE(vector2, uint, 16, 4);
- DECL_VARIABLE(vector2, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE (vector2, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector2, float, 32, 2);
- DECL_VARIABLE(vector2, int, 8, 16);
- DECL_VARIABLE(vector2, int, 16, 8);
-@@ -70,6 +90,9 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector2, uint, 8, 16);
- DECL_VARIABLE(vector2, uint, 16, 8);
- DECL_VARIABLE(vector2, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE (vector2, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector2, float, 32, 4);
-
- DECL_VARIABLE(vector_res, uint, 8, 8);
-@@ -88,6 +111,9 @@ void FNNAME (INSN_NAME) (void)
- VLOAD(vector, buffer, , uint, u, 8, 8);
- VLOAD(vector, buffer, , uint, u, 16, 4);
- VLOAD(vector, buffer, , uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD (vector, buffer, , float, f, 16, 4);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
-
- VLOAD(vector, buffer, q, int, s, 8, 16);
-@@ -96,6 +122,9 @@ void FNNAME (INSN_NAME) (void)
- VLOAD(vector, buffer, q, uint, u, 8, 16);
- VLOAD(vector, buffer, q, uint, u, 16, 8);
- VLOAD(vector, buffer, q, uint, u, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD (vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
- /* Choose init value arbitrarily, will be used for vector
-@@ -106,6 +135,9 @@ void FNNAME (INSN_NAME) (void)
- VDUP(vector2, , uint, u, 8, 8, 0xF3);
- VDUP(vector2, , uint, u, 16, 4, 0xFFF2);
- VDUP(vector2, , uint, u, 32, 2, 0xFFFFFFF1);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP (vector2, , float, f, 16, 4, -15.0f);
-+#endif
- VDUP(vector2, , float, f, 32, 2, -15.0f);
-
- VDUP(vector2, q, int, s, 8, 16, -4);
-@@ -114,6 +146,9 @@ void FNNAME (INSN_NAME) (void)
- VDUP(vector2, q, uint, u, 8, 16, 0xF4);
- VDUP(vector2, q, uint, u, 16, 8, 0xFFF6);
- VDUP(vector2, q, uint, u, 32, 4, 0xFFFFFFF2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP (vector2, q, float, f, 16, 8, -14.0f);
-+#endif
- VDUP(vector2, q, float, f, 32, 4, -14.0f);
-
- /* The comparison operators produce only unsigned results, which
-@@ -154,9 +189,17 @@ void FNNAME (INSN_NAME) (void)
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_q_uint, "");
-
- /* The float variants. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_float, "");
-+#endif
- TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_float, "");
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VCOMP (INSN_NAME, q, float, f, uint, 16, 8);
-+ CHECK (TEST_MSG, uint, 16, 8, PRIx16, expected_q_float, "");
-+#endif
- TEST_VCOMP(INSN_NAME, q, float, f, uint, 32, 4);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_q_float, "");
-
-@@ -176,6 +219,43 @@ void FNNAME (INSN_NAME) (void)
-
-
- /* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP (vector, , float, f, 16, 4, 1.0);
-+ VDUP (vector2, , float, f, 16, 4, NAN);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_nan, "FP special (NaN)");
-+
-+ VDUP (vector, , float, f, 16, 4, 1.0);
-+ VDUP (vector2, , float, f, 16, 4, -NAN);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mnan, " FP special (-NaN)");
-+
-+ VDUP (vector, , float, f, 16, 4, NAN);
-+ VDUP (vector2, , float, f, 16, 4, 1.0);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_nan2, " FP special (NaN)");
-+
-+ VDUP (vector, , float, f, 16, 4, 1.0);
-+ VDUP (vector2, , float, f, 16, 4, HUGE_VALF);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_inf, " FP special (inf)");
-+
-+ VDUP (vector, , float, f, 16, 4, 1.0);
-+ VDUP (vector2, , float, f, 16, 4, -HUGE_VALF);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_minf, " FP special (-inf)");
-+
-+ VDUP (vector, , float, f, 16, 4, HUGE_VALF);
-+ VDUP (vector2, , float, f, 16, 4, 1.0);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_inf2, " FP special (inf)");
-+
-+ VDUP (vector, , float, f, 16, 4, -0.0);
-+ VDUP (vector2, , float, f, 16, 4, 0.0);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mzero, " FP special (-0.0)");
-+#endif
-+
- VDUP(vector, , float, f, 32, 2, 1.0);
- VDUP(vector2, , float, f, 32, 2, NAN);
- TEST_VCOMP(INSN_NAME, , float, f, uint, 32, 2);
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/cmp_zero_op.inc
-@@ -0,0 +1,111 @@
-+/* Template file for the validation of compare against zero operators.
-+
-+   This file is based on cmp_op.inc. It is meant to be included by the relevant
-+ test files, which have to define the intrinsic family to test. If a given
-+ intrinsic supports variants which are not supported by all the other
-+ operators, these can be tested by providing a definition for EXTRA_TESTS. */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+#include <math.h>
-+
-+/* Additional expected results declaration, they are initialized in
-+ each test file. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+extern ARRAY(expected_float, uint, 16, 4);
-+extern ARRAY(expected_q_float, uint, 16, 8);
-+extern ARRAY(expected_uint2, uint, 16, 4);
-+extern ARRAY(expected_uint3, uint, 16, 4);
-+extern ARRAY(expected_uint4, uint, 16, 4);
-+extern ARRAY(expected_nan, uint, 16, 4);
-+extern ARRAY(expected_mnan, uint, 16, 4);
-+extern ARRAY(expected_inf, uint, 16, 4);
-+extern ARRAY(expected_minf, uint, 16, 4);
-+extern ARRAY(expected_zero, uint, 16, 4);
-+extern ARRAY(expected_mzero, uint, 16, 4);
-+#endif
-+
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1(NAME)
-+
-+void FNNAME (INSN_NAME) (void)
-+{
-+ /* Basic test: y=vcomp(x1,x2), then store the result. */
-+#define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) \
-+ VECT_VAR(vector_res, T3, W, N) = \
-+ INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N)); \
-+ vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vector_res, T3, W, N))
-+
-+#define TEST_VCOMP(INSN, Q, T1, T2, T3, W, N) \
-+ TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
-+
-+  /* No need for 64-bit elements. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE (vector, float, 16, 4);
-+ DECL_VARIABLE (vector, float, 16, 8);
-+#endif
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, uint, 16, 4);
-+ DECL_VARIABLE(vector_res, uint, 16, 8);
-+#endif
-+
-+ clean_results ();
-+
-+ /* Choose init value arbitrarily, will be used for vector
-+ comparison. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP (vector, , float, f, 16, 4, -15.0f);
-+ VDUP (vector, q, float, f, 16, 8, 14.0f);
-+#endif
-+
-+ /* Float variants. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ TEST_VCOMP (INSN_NAME, q, float, f, uint, 16, 8);
-+#endif
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_float, "");
-+ CHECK (TEST_MSG, uint, 16, 8, PRIx16, expected_q_float, "");
-+#endif
-+
-+ /* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP (vector, , float, f, 16, 4, NAN);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_nan, "FP special (NaN)");
-+
-+ VDUP (vector, , float, f, 16, 4, -NAN);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mnan, " FP special (-NaN)");
-+
-+ VDUP (vector, , float, f, 16, 4, HUGE_VALF);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_inf, " FP special (inf)");
-+
-+ VDUP (vector, , float, f, 16, 4, -HUGE_VALF);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_minf, " FP special (-inf)");
-+
-+ VDUP (vector, , float, f, 16, 4, 0.0);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_zero, " FP special (0.0)");
-+
-+  VDUP (vector, , float, f, 16, 4, -0.0);
-+ TEST_VCOMP (INSN_NAME, , float, f, uint, 16, 4);
-+ CHECK (TEST_MSG, uint, 16, 4, PRIx16, expected_mzero, " FP special (-0.0)");
-+#endif
-+
-+#ifdef EXTRA_TESTS
-+ EXTRA_TESTS();
-+#endif
-+}
-+
-+int main (void)
-+{
-+ FNNAME (INSN_NAME) ();
-+
-+ return 0;
-+}
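Like the other templates in this directory, cmp_zero_op.inc is consumed by a test file that defines the intrinsic family and the expected lane masks before pulling in the template. A hypothetical consumer, assuming vceqz as the family under test (the dg- directives of a real test are omitted; the mask values simply encode that vceqz sets a lane iff the input lane is +/-0.0):

#include <arm_neon.h>
#include "arm-neon-ref.h"
#include "compute-ref-data.h"
#include <math.h>

VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
                                                     0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
                                                  0xffff, 0xffff };
VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
                                                   0xffff, 0xffff };

#define TEST_MSG "VCEQZ/VCEQZQ"
#define INSN_NAME vceqz

#include "cmp_zero_op.inc"   /* Provides exec_vceqz () and main ().  */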
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/compute-ref-data.h
-@@ -118,6 +118,10 @@ VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
- PAD(buffer_pad, uint, 32, 2);
- VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
- PAD(buffer_pad, uint, 64, 1);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+VECT_VAR_DECL_INIT(buffer, poly, 64, 1);
-+PAD(buffer_pad, poly, 64, 1);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- VECT_VAR_DECL_INIT(buffer, float, 16, 4);
- PAD(buffer_pad, float, 16, 4);
-@@ -144,6 +148,10 @@ VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
- PAD(buffer_pad, poly, 8, 16);
- VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
- PAD(buffer_pad, poly, 16, 8);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+VECT_VAR_DECL_INIT(buffer, poly, 64, 2);
-+PAD(buffer_pad, poly, 64, 2);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- VECT_VAR_DECL_INIT(buffer, float, 16, 8);
- PAD(buffer_pad, float, 16, 8);
-@@ -178,6 +186,10 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
- VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
- VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
- VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 1);
-+VECT_VAR_DECL(buffer_dup_pad, poly, 64, 1);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- VECT_VAR_DECL_INIT4(buffer_dup, float, 16, 4);
- VECT_VAR_DECL(buffer_dup_pad, float, 16, 4);
-@@ -205,6 +217,10 @@ VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
- VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
- VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
- VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
-+#if defined (__ARM_FEATURE_CRYPTO)
-+VECT_VAR_DECL_INIT4(buffer_dup, poly, 64, 2);
-+VECT_VAR_DECL(buffer_dup_pad, poly, 64, 2);
-+#endif
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- VECT_VAR_DECL_INIT(buffer_dup, float, 16, 8);
- VECT_VAR_DECL(buffer_dup_pad, float, 16, 8);
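The poly64 buffers added above are wrapped in __ARM_FEATURE_CRYPTO because the poly64/poly128 types themselves are only declared when the crypto extension is enabled; unconditional declarations would break non-crypto configurations. A standalone sketch of the same guard:

#include <arm_neon.h>

#if defined (__ARM_FEATURE_CRYPTO)
/* poly64x1_t and vld1_p64 only exist under the crypto extension.  */
poly64x1_t
load_one_p64 (const poly64_t *p)
{
  return vld1_p64 (p);
}
#endif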
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
-@@ -0,0 +1,1024 @@
-+/* This file contains tests for all the *p64 intrinsics, except for
-+   vreinterpret, which has its own testcase. */
-+
-+/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
-+/* { dg-add-options arm_crypto } */
-+/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results: vbsl. */
-+VECT_VAR_DECL(vbsl_expected,poly,64,1) [] = { 0xfffffff1 };
-+VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
-+ 0xfffffff1 };
-+
-+/* Expected results: vceq. */
-+VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
-+
-+/* Expected results: vcombine. */
-+VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
-+
-+/* Expected results: vcreate. */
-+VECT_VAR_DECL(vcreate_expected,poly,64,1) [] = { 0x123456789abcdef0 };
-+
-+/* Expected results: vdup_lane. */
-+VECT_VAR_DECL(vdup_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vdup_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff0 };
-+
-+/* Expected results: vdup_n. */
-+VECT_VAR_DECL(vdup_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vdup_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vdup_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vdup_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vdup_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vdup_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
-+ 0xfffffffffffffff2 };
-+
-+/* Expected results: vmov_n. */
-+VECT_VAR_DECL(vmov_n_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vmov_n_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vmov_n_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vmov_n_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vmov_n_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vmov_n_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
-+ 0xfffffffffffffff2 };
-+
-+/* Expected results: vext. */
-+VECT_VAR_DECL(vext_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vext_expected,poly,64,2) [] = { 0xfffffffffffffff1, 0x88 };
-+
-+/* Expected results: vget_low. */
-+VECT_VAR_DECL(vget_low_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
-+
-+/* Expected results: vget_high. */
-+VECT_VAR_DECL(vget_high_expected,poly,64,1) [] = { 0xfffffffffffffff1 };
-+
-+/* Expected results: vld1. */
-+VECT_VAR_DECL(vld1_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld1_expected,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+
-+/* Expected results: vld1_dup. */
-+VECT_VAR_DECL(vld1_dup_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld1_dup_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld1_dup_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld1_dup_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld1_dup_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vld1_dup_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
-+ 0xfffffffffffffff2 };
-+
-+/* Expected results: vld1_lane. */
-+VECT_VAR_DECL(vld1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xaaaaaaaaaaaaaaaa };
-+
-+/* Expected results: vldX. */
-+VECT_VAR_DECL(vld2_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld2_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld3_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld3_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld3_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vld4_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld4_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld4_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vld4_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
-+
-+/* Expected results: vldX_dup. */
-+VECT_VAR_DECL(vld2_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld2_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld3_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld3_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld3_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vld4_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vld4_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vld4_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(vld4_dup_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
-+
-+/* Expected results: vsli. */
-+VECT_VAR_DECL(vsli_expected,poly,64,1) [] = { 0x10 };
-+VECT_VAR_DECL(vsli_expected,poly,64,2) [] = { 0x7ffffffffffff0,
-+ 0x7ffffffffffff1 };
-+VECT_VAR_DECL(vsli_expected_max_shift,poly,64,1) [] = { 0x7ffffffffffffff0 };
-+VECT_VAR_DECL(vsli_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+
-+/* Expected results: vsri. */
-+VECT_VAR_DECL(vsri_expected,poly,64,1) [] = { 0xe000000000000000 };
-+VECT_VAR_DECL(vsri_expected,poly,64,2) [] = { 0xfffffffffffff800,
-+ 0xfffffffffffff800 };
-+VECT_VAR_DECL(vsri_expected_max_shift,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vsri_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+
-+/* Expected results: vst1_lane. */
-+VECT_VAR_DECL(vst1_lane_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vst1_lane_expected,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0x3333333333333333 };
-+
-+/* Expected results: vldX_lane. */
-+VECT_VAR_DECL(expected_vld_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected_vld_st2_0,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected_vld_st2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected_vld_st2_1,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
-+ 0xaaaaaaaaaaaaaaaa };
-+VECT_VAR_DECL(expected_vld_st3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected_vld_st3_0,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected_vld_st3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected_vld_st3_1,poly,64,2) [] = { 0xfffffffffffffff2,
-+ 0xaaaaaaaaaaaaaaaa };
-+VECT_VAR_DECL(expected_vld_st3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(expected_vld_st3_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
-+ 0xaaaaaaaaaaaaaaaa };
-+VECT_VAR_DECL(expected_vld_st4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected_vld_st4_0,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected_vld_st4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected_vld_st4_1,poly,64,2) [] = { 0xfffffffffffffff2,
-+ 0xfffffffffffffff3 };
-+VECT_VAR_DECL(expected_vld_st4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
-+VECT_VAR_DECL(expected_vld_st4_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
-+ 0xaaaaaaaaaaaaaaaa };
-+VECT_VAR_DECL(expected_vld_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
-+VECT_VAR_DECL(expected_vld_st4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
-+ 0xaaaaaaaaaaaaaaaa };
-+
-+/* Expected results: vget_lane. */
-+VECT_VAR_DECL(vget_lane_expected,poly,64,1) = 0xfffffffffffffff0;
-+VECT_VAR_DECL(vget_lane_expected,poly,64,2) = 0xfffffffffffffff0;
-+
-+int main (void)
-+{
-+ int i;
-+
-+ /* vbsl_p64 tests. */
-+#define TEST_MSG "VBSL/VBSLQ"
-+
-+#define TEST_VBSL(T3, Q, T1, T2, W, N) \
-+ VECT_VAR(vbsl_vector_res, T1, W, N) = \
-+ vbsl##Q##_##T2##W(VECT_VAR(vbsl_vector_first, T3, W, N), \
-+ VECT_VAR(vbsl_vector, T1, W, N), \
-+ VECT_VAR(vbsl_vector2, T1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vbsl_vector_res, T1, W, N))
-+
-+ DECL_VARIABLE(vbsl_vector, poly, 64, 1);
-+ DECL_VARIABLE(vbsl_vector, poly, 64, 2);
-+ DECL_VARIABLE(vbsl_vector2, poly, 64, 1);
-+ DECL_VARIABLE(vbsl_vector2, poly, 64, 2);
-+ DECL_VARIABLE(vbsl_vector_res, poly, 64, 1);
-+ DECL_VARIABLE(vbsl_vector_res, poly, 64, 2);
-+
-+ DECL_VARIABLE(vbsl_vector_first, uint, 64, 1);
-+ DECL_VARIABLE(vbsl_vector_first, uint, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vbsl_vector, buffer, , poly, p, 64, 1);
-+ VLOAD(vbsl_vector, buffer, q, poly, p, 64, 2);
-+
-+ VDUP(vbsl_vector2, , poly, p, 64, 1, 0xFFFFFFF3);
-+ VDUP(vbsl_vector2, q, poly, p, 64, 2, 0xFFFFFFF3);
-+
-+ VDUP(vbsl_vector_first, , uint, u, 64, 1, 0xFFFFFFF2);
-+ VDUP(vbsl_vector_first, q, uint, u, 64, 2, 0xFFFFFFF2);
-+
-+ TEST_VBSL(uint, , poly, p, 64, 1);
-+ TEST_VBSL(uint, q, poly, p, 64, 2);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vbsl_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vbsl_expected, "");
-+
-+ /* vceq_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VCEQ"
-+
-+#define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) \
-+ VECT_VAR(vceq_vector_res, T3, W, N) = \
-+ INSN##Q##_##T2##W(VECT_VAR(vceq_vector, T1, W, N), \
-+ VECT_VAR(vceq_vector2, T1, W, N)); \
-+ vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceq_vector_res, T3, W, N))
-+
-+#define TEST_VCOMP(INSN, Q, T1, T2, T3, W, N) \
-+ TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
-+
-+ DECL_VARIABLE(vceq_vector, poly, 64, 1);
-+ DECL_VARIABLE(vceq_vector2, poly, 64, 1);
-+ DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
-+
-+ CLEAN(result, uint, 64, 1);
-+
-+ VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
-+
-+ VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
-+
-+ TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
-+
-+ CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
-+
-+ /* vcombine_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VCOMBINE"
-+
-+#define TEST_VCOMBINE(T1, T2, W, N, N2) \
-+ VECT_VAR(vcombine_vector128, T1, W, N2) = \
-+ vcombine_##T2##W(VECT_VAR(vcombine_vector64_a, T1, W, N), \
-+ VECT_VAR(vcombine_vector64_b, T1, W, N)); \
-+ vst1q_##T2##W(VECT_VAR(result, T1, W, N2), VECT_VAR(vcombine_vector128, T1, W, N2))
-+
-+ DECL_VARIABLE(vcombine_vector64_a, poly, 64, 1);
-+ DECL_VARIABLE(vcombine_vector64_b, poly, 64, 1);
-+ DECL_VARIABLE(vcombine_vector128, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vcombine_vector64_a, buffer, , poly, p, 64, 1);
-+
-+ VDUP(vcombine_vector64_b, , poly, p, 64, 1, 0x88);
-+
-+ TEST_VCOMBINE(poly, p, 64, 1, 2);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vcombine_expected, "");
-+
-+ /* vcreate_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VCREATE"
-+
-+#define TEST_VCREATE(T1, T2, W, N) \
-+ VECT_VAR(vcreate_vector_res, T1, W, N) = \
-+ vcreate_##T2##W(VECT_VAR(vcreate_val, T1, W, N)); \
-+ vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vcreate_vector_res, T1, W, N))
-+
-+#define DECL_VAL(VAR, T1, W, N) \
-+ uint64_t VECT_VAR(VAR, T1, W, N)
-+
-+ DECL_VAL(vcreate_val, poly, 64, 1);
-+ DECL_VARIABLE(vcreate_vector_res, poly, 64, 1);
-+
-+  CLEAN(result, poly, 64, 1);
-+
-+ VECT_VAR(vcreate_val, poly, 64, 1) = 0x123456789abcdef0ULL;
-+
-+ TEST_VCREATE(poly, p, 64, 1);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vcreate_expected, "");
-+
-+ /* vdup_lane_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VDUP_LANE/VDUP_LANEQ"
-+
-+#define TEST_VDUP_LANE(Q, T1, T2, W, N, N2, L) \
-+ VECT_VAR(vdup_lane_vector_res, T1, W, N) = \
-+ vdup##Q##_lane_##T2##W(VECT_VAR(vdup_lane_vector, T1, W, N2), L); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_lane_vector_res, T1, W, N))
-+
-+ DECL_VARIABLE(vdup_lane_vector, poly, 64, 1);
-+ DECL_VARIABLE(vdup_lane_vector, poly, 64, 2);
-+ DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 1);
-+ DECL_VARIABLE(vdup_lane_vector_res, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vdup_lane_vector, buffer, , poly, p, 64, 1);
-+
-+ TEST_VDUP_LANE(, poly, p, 64, 1, 1, 0);
-+ TEST_VDUP_LANE(q, poly, p, 64, 2, 1, 0);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_lane_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_lane_expected, "");
-+
-+ /* vdup_n_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VDUP/VDUPQ"
-+
-+#define TEST_VDUP(Q, T1, T2, W, N) \
-+ VECT_VAR(vdup_n_vector, T1, W, N) = \
-+ vdup##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vdup_n_vector, T1, W, N))
-+
-+ DECL_VARIABLE(vdup_n_vector, poly, 64, 1);
-+ DECL_VARIABLE(vdup_n_vector, poly, 64, 2);
-+
-+ /* Try to read different places from the input buffer. */
-+ for (i=0; i< 3; i++) {
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VDUP(, poly, p, 64, 1);
-+ TEST_VDUP(q, poly, p, 64, 2);
-+
-+ switch (i) {
-+ case 0:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected0, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected0, "");
-+ break;
-+ case 1:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected1, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected1, "");
-+ break;
-+ case 2:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vdup_n_expected2, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vdup_n_expected2, "");
-+ break;
-+ default:
-+ abort();
-+ }
-+ }
-+
-+  /* vext_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VEXT/VEXTQ"
-+
-+#define TEST_VEXT(Q, T1, T2, W, N, V) \
-+ VECT_VAR(vext_vector_res, T1, W, N) = \
-+ vext##Q##_##T2##W(VECT_VAR(vext_vector1, T1, W, N), \
-+ VECT_VAR(vext_vector2, T1, W, N), \
-+ V); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vext_vector_res, T1, W, N))
-+
-+ DECL_VARIABLE(vext_vector1, poly, 64, 1);
-+ DECL_VARIABLE(vext_vector1, poly, 64, 2);
-+ DECL_VARIABLE(vext_vector2, poly, 64, 1);
-+ DECL_VARIABLE(vext_vector2, poly, 64, 2);
-+ DECL_VARIABLE(vext_vector_res, poly, 64, 1);
-+ DECL_VARIABLE(vext_vector_res, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vext_vector1, buffer, , poly, p, 64, 1);
-+ VLOAD(vext_vector1, buffer, q, poly, p, 64, 2);
-+
-+ VDUP(vext_vector2, , poly, p, 64, 1, 0x88);
-+ VDUP(vext_vector2, q, poly, p, 64, 2, 0x88);
-+
-+ TEST_VEXT(, poly, p, 64, 1, 0);
-+ TEST_VEXT(q, poly, p, 64, 2, 1);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vext_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vext_expected, "");
-+
-+ /* vget_low_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VGET_LOW"
-+
-+#define TEST_VGET_LOW(T1, T2, W, N, N2) \
-+ VECT_VAR(vget_low_vector64, T1, W, N) = \
-+ vget_low_##T2##W(VECT_VAR(vget_low_vector128, T1, W, N2)); \
-+ vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_low_vector64, T1, W, N))
-+
-+ DECL_VARIABLE(vget_low_vector64, poly, 64, 1);
-+ DECL_VARIABLE(vget_low_vector128, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+
-+ VLOAD(vget_low_vector128, buffer, q, poly, p, 64, 2);
-+
-+ TEST_VGET_LOW(poly, p, 64, 1, 2);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_low_expected, "");
-+
-+ /* vget_high_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VGET_HIGH"
-+
-+#define TEST_VGET_HIGH(T1, T2, W, N, N2) \
-+ VECT_VAR(vget_high_vector64, T1, W, N) = \
-+ vget_high_##T2##W(VECT_VAR(vget_high_vector128, T1, W, N2)); \
-+ vst1_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vget_high_vector64, T1, W, N))
-+
-+ DECL_VARIABLE(vget_high_vector64, poly, 64, 1);
-+ DECL_VARIABLE(vget_high_vector128, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+
-+ VLOAD(vget_high_vector128, buffer, q, poly, p, 64, 2);
-+
-+ TEST_VGET_HIGH(poly, p, 64, 1, 2);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vget_high_expected, "");
-+
-+ /* vld1_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VLD1/VLD1Q"
-+
-+#define TEST_VLD1(VAR, BUF, Q, T1, T2, W, N) \
-+ VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N))
-+
-+ DECL_VARIABLE(vld1_vector, poly, 64, 1);
-+ DECL_VARIABLE(vld1_vector, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vld1_vector, buffer, , poly, p, 64, 1);
-+ VLOAD(vld1_vector, buffer, q, poly, p, 64, 2);
-+
-+ TEST_VLD1(vld1_vector, buffer, , poly, p, 64, 1);
-+ TEST_VLD1(vld1_vector, buffer, q, poly, p, 64, 2);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, "");
-+
-+ /* vld1_dup_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VLD1_DUP/VLD1_DUPQ"
-+
-+#define TEST_VLD1_DUP(VAR, BUF, Q, T1, T2, W, N) \
-+ VECT_VAR(VAR, T1, W, N) = \
-+ vld1##Q##_dup_##T2##W(&VECT_VAR(BUF, T1, W, N)[i]); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(VAR, T1, W, N))
-+
-+ DECL_VARIABLE(vld1_dup_vector, poly, 64, 1);
-+ DECL_VARIABLE(vld1_dup_vector, poly, 64, 2);
-+
-+ /* Try to read different places from the input buffer. */
-+ for (i=0; i<3; i++) {
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, , poly, p, 64, 1);
-+ TEST_VLD1_DUP(vld1_dup_vector, buffer_dup, q, poly, p, 64, 2);
-+
-+ switch (i) {
-+ case 0:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, "");
-+ break;
-+ case 1:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, "");
-+ break;
-+ case 2:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, "");
-+ break;
-+ default:
-+ abort();
-+ }
-+ }
-+
-+ /* vld1_lane_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VLD1_LANE/VLD1_LANEQ"
-+
-+#define TEST_VLD1_LANE(Q, T1, T2, W, N, L) \
-+ memset (VECT_VAR(vld1_lane_buffer_src, T1, W, N), 0xAA, W/8*N); \
-+ VECT_VAR(vld1_lane_vector_src, T1, W, N) = \
-+ vld1##Q##_##T2##W(VECT_VAR(vld1_lane_buffer_src, T1, W, N)); \
-+ VECT_VAR(vld1_lane_vector, T1, W, N) = \
-+ vld1##Q##_lane_##T2##W(VECT_VAR(buffer, T1, W, N), \
-+ VECT_VAR(vld1_lane_vector_src, T1, W, N), L); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vld1_lane_vector, T1, W, N))
-+
-+ DECL_VARIABLE(vld1_lane_vector, poly, 64, 1);
-+ DECL_VARIABLE(vld1_lane_vector, poly, 64, 2);
-+ DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 1);
-+ DECL_VARIABLE(vld1_lane_vector_src, poly, 64, 2);
-+
-+ ARRAY(vld1_lane_buffer_src, poly, 64, 1);
-+ ARRAY(vld1_lane_buffer_src, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VLD1_LANE(, poly, p, 64, 1, 0);
-+ TEST_VLD1_LANE(q, poly, p, 64, 2, 0);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld1_lane_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vld1_lane_expected, "");
-+
-+ /* vldX_p64 tests. */
-+#define DECL_VLDX(T1, W, N, X) \
-+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_vector, T1, W, N, X); \
-+ VECT_VAR_DECL(vldX_result_bis_##X, T1, W, N)[X * N]
-+
-+#define TEST_VLDX(Q, T1, T2, W, N, X) \
-+ VECT_ARRAY_VAR(vldX_vector, T1, W, N, X) = \
-+    /* Use dedicated init buffer, of size X. */ \
-+ vld##X##Q##_##T2##W(VECT_ARRAY_VAR(buffer_vld##X, T1, W, N, X)); \
-+ vst##X##Q##_##T2##W(VECT_VAR(vldX_result_bis_##X, T1, W, N), \
-+ VECT_ARRAY_VAR(vldX_vector, T1, W, N, X)); \
-+ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_result_bis_##X, T1, W, N), \
-+ sizeof(VECT_VAR(result, T1, W, N)));
-+
-+ /* Overwrite "result" with the contents of "result_bis"[Y]. */
-+#define TEST_EXTRA_CHUNK(T1, W, N, X,Y) \
-+ memcpy(VECT_VAR(result, T1, W, N), \
-+ &(VECT_VAR(vldX_result_bis_##X, T1, W, N)[Y*N]), \
-+ sizeof(VECT_VAR(result, T1, W, N)));
-+
-+ DECL_VLDX(poly, 64, 1, 2);
-+ DECL_VLDX(poly, 64, 1, 3);
-+ DECL_VLDX(poly, 64, 1, 4);
-+
-+ VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 1);
-+ PAD(buffer_vld2_pad, poly, 64, 1);
-+ VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 1);
-+ PAD(buffer_vld3_pad, poly, 64, 1);
-+ VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 1);
-+ PAD(buffer_vld4_pad, poly, 64, 1);
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VLD2/VLD2Q"
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX(, poly, p, 64, 1, 2);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_EXTRA_CHUNK(poly, 64, 1, 2, 1);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VLD3/VLD3Q"
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX(, poly, p, 64, 1, 3);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_EXTRA_CHUNK(poly, 64, 1, 3, 1);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_EXTRA_CHUNK(poly, 64, 1, 3, 2);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VLD4/VLD4Q"
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX(, poly, p, 64, 1, 4);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_EXTRA_CHUNK(poly, 64, 1, 4, 1);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_EXTRA_CHUNK(poly, 64, 1, 4, 2);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_EXTRA_CHUNK(poly, 64, 1, 4, 3);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3");
-+
-+ /* vldX_dup_p64 tests. */
-+#define DECL_VLDX_DUP(T1, W, N, X) \
-+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X); \
-+ VECT_VAR_DECL(vldX_dup_result_bis_##X, T1, W, N)[X * N]
-+
-+#define TEST_VLDX_DUP(Q, T1, T2, W, N, X) \
-+ VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X) = \
-+ vld##X##Q##_dup_##T2##W(&VECT_VAR(buffer_dup, T1, W, N)[0]); \
-+ \
-+ vst##X##Q##_##T2##W(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N), \
-+ VECT_ARRAY_VAR(vldX_dup_vector, T1, W, N, X)); \
-+ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(vldX_dup_result_bis_##X, T1, W, N), \
-+ sizeof(VECT_VAR(result, T1, W, N)));
-+
-+ /* Overwrite "result" with the contents of "result_bis"[Y]. */
-+#define TEST_VLDX_DUP_EXTRA_CHUNK(T1, W, N, X,Y) \
-+ memcpy(VECT_VAR(result, T1, W, N), \
-+ &(VECT_VAR(vldX_dup_result_bis_##X, T1, W, N)[Y*N]), \
-+ sizeof(VECT_VAR(result, T1, W, N)));
-+
-+ DECL_VLDX_DUP(poly, 64, 1, 2);
-+ DECL_VLDX_DUP(poly, 64, 1, 3);
-+ DECL_VLDX_DUP(poly, 64, 1, 4);
-+
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP(, poly, p, 64, 1, 2);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 2, 1);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP(, poly, p, 64, 1, 3);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 1);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 3, 2);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP(, poly, p, 64, 1, 4);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 1);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 2);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2");
-+ CLEAN(result, poly, 64, 1);
-+ TEST_VLDX_DUP_EXTRA_CHUNK(poly, 64, 1, 4, 3);
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3");
-+
-+ /* vsli_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VSLI"
-+
-+#define TEST_VSXI1(INSN, Q, T1, T2, W, N, V) \
-+ VECT_VAR(vsXi_vector_res, T1, W, N) = \
-+ INSN##Q##_n_##T2##W(VECT_VAR(vsXi_vector, T1, W, N), \
-+ VECT_VAR(vsXi_vector2, T1, W, N), \
-+ V); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vsXi_vector_res, T1, W, N))
-+
-+#define TEST_VSXI(INSN, Q, T1, T2, W, N, V) \
-+ TEST_VSXI1(INSN, Q, T1, T2, W, N, V)
-+
-+ DECL_VARIABLE(vsXi_vector, poly, 64, 1);
-+ DECL_VARIABLE(vsXi_vector, poly, 64, 2);
-+ DECL_VARIABLE(vsXi_vector2, poly, 64, 1);
-+ DECL_VARIABLE(vsXi_vector2, poly, 64, 2);
-+ DECL_VARIABLE(vsXi_vector_res, poly, 64, 1);
-+ DECL_VARIABLE(vsXi_vector_res, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vsXi_vector, buffer, , poly, p, 64, 1);
-+ VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2);
-+
-+ VDUP(vsXi_vector2, , poly, p, 64, 1, 2);
-+ VDUP(vsXi_vector2, q, poly, p, 64, 2, 3);
-+
-+ TEST_VSXI(vsli, , poly, p, 64, 1, 3);
-+ TEST_VSXI(vsli, q, poly, p, 64, 2, 53);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, "");
-+
-+ /* Test cases with maximum shift amount. */
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VSXI(vsli, , poly, p, 64, 1, 63);
-+ TEST_VSXI(vsli, q, poly, p, 64, 2, 63);
-+
-+#define COMMENT "(max shift amount)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT);
-+
-+ /* vsri_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VSRI"
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ VLOAD(vsXi_vector, buffer, , poly, p, 64, 1);
-+ VLOAD(vsXi_vector, buffer, q, poly, p, 64, 2);
-+
-+ VDUP(vsXi_vector2, , poly, p, 64, 1, 2);
-+ VDUP(vsXi_vector2, q, poly, p, 64, 2, 3);
-+
-+ TEST_VSXI(vsri, , poly, p, 64, 1, 3);
-+ TEST_VSXI(vsri, q, poly, p, 64, 2, 53);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected, "");
-+
-+ /* Test cases with maximum shift amount. */
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VSXI(vsri, , poly, p, 64, 1, 64);
-+ TEST_VSXI(vsri, q, poly, p, 64, 2, 64);
-+
-+#define COMMENT "(max shift amount)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vsri_expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vsri_expected_max_shift, COMMENT);
-+
-+ /* vst1_lane_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VST1_LANE/VST1_LANEQ"
-+
-+#define TEST_VST1_LANE(Q, T1, T2, W, N, L) \
-+ VECT_VAR(vst1_lane_vector, T1, W, N) = \
-+ vld1##Q##_##T2##W(VECT_VAR(buffer, T1, W, N)); \
-+ vst1##Q##_lane_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vst1_lane_vector, T1, W, N), L);
-+
-+ DECL_VARIABLE(vst1_lane_vector, poly, 64, 1);
-+ DECL_VARIABLE(vst1_lane_vector, poly, 64, 2);
-+
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VST1_LANE(, poly, p, 64, 1, 0);
-+ TEST_VST1_LANE(q, poly, p, 64, 2, 0);
-+
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vst1_lane_expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vst1_lane_expected, "");
-+
-+#ifdef __aarch64__
-+
-+ /* vmov_n_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VMOV/VMOVQ"
-+
-+#define TEST_VMOV(Q, T1, T2, W, N) \
-+ VECT_VAR(vmov_n_vector, T1, W, N) = \
-+ vmov##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vmov_n_vector, T1, W, N))
-+
-+ DECL_VARIABLE(vmov_n_vector, poly, 64, 1);
-+ DECL_VARIABLE(vmov_n_vector, poly, 64, 2);
-+
-+ /* Try to read different places from the input buffer. */
-+ for (i=0; i< 3; i++) {
-+ CLEAN(result, poly, 64, 1);
-+ CLEAN(result, poly, 64, 2);
-+
-+ TEST_VMOV(, poly, p, 64, 1);
-+ TEST_VMOV(q, poly, p, 64, 2);
-+
-+ switch (i) {
-+ case 0:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected0, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected0, "");
-+ break;
-+ case 1:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected1, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected1, "");
-+ break;
-+ case 2:
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, vmov_n_expected2, "");
-+ CHECK_POLY(TEST_MSG, poly, 64, 2, PRIx64, vmov_n_expected2, "");
-+ break;
-+ default:
-+ abort();
-+ }
-+ }
-+
-+ /* vget_lane_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VGET_LANE/VGETQ_LANE"
-+
-+#define TEST_VGET_LANE(Q, T1, T2, W, N, L) \
-+ VECT_VAR(vget_lane_vector, T1, W, N) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \
-+ if (VECT_VAR(vget_lane_vector, T1, W, N) != VECT_VAR(vget_lane_expected, T1, W, N)) { \
-+ fprintf(stderr, \
-+ "ERROR in %s (%s line %d in result '%s') at type %s " \
-+ "got 0x%" PRIx##W " != 0x%" PRIx##W "\n", \
-+ TEST_MSG, __FILE__, __LINE__, \
-+ STR(VECT_VAR(vget_lane_expected, T1, W, N)), \
-+ STR(VECT_NAME(T1, W, N)), \
-+ (uint##W##_t)VECT_VAR(vget_lane_vector, T1, W, N), \
-+ (uint##W##_t)VECT_VAR(vget_lane_expected, T1, W, N)); \
-+ abort (); \
-+ }
-+
-+ /* Initialize input values. */
-+ DECL_VARIABLE(vector, poly, 64, 1);
-+ DECL_VARIABLE(vector, poly, 64, 2);
-+
-+ VLOAD(vector, buffer, , poly, p, 64, 1);
-+ VLOAD(vector, buffer, q, poly, p, 64, 2);
-+
-+ VECT_VAR_DECL(vget_lane_vector, poly, 64, 1);
-+ VECT_VAR_DECL(vget_lane_vector, poly, 64, 2);
-+
-+ TEST_VGET_LANE( , poly, p, 64, 1, 0);
-+ TEST_VGET_LANE(q, poly, p, 64, 2, 0);
-+
-+ /* vldx_lane_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VLDX_LANE/VLDXQ_LANE"
-+
-+VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 64, 2);
-+VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 64, 3);
-+VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 64, 4);
-+
-+ /* In this case, input variables are arrays of vectors. */
-+#define DECL_VLD_STX_LANE(T1, W, N, X) \
-+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
-+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
-+ VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
-+
-+  /* We need to use a temporary result buffer (result_bis), because
-+     the one used for other tests is not large enough. A subset of the
-+     result data is moved from result_bis to result, and it is this
-+     subset which is used to check the actual behavior. The next
-+     macro enables moving another chunk of data from result_bis to
-+     result. */
-+  /* We also use another extra input buffer (buffer_src), which we
-+     fill with 0xAA, and which is used to load a vector from which we
-+     read a given lane. */
-+
-+#define TEST_VLDX_LANE(Q, T1, T2, W, N, X, L) \
-+ memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
-+ sizeof(VECT_VAR(buffer_src, T1, W, N))); \
-+ \
-+ VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
-+ vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
-+ \
-+ VECT_ARRAY_VAR(vector, T1, W, N, X) = \
-+    /* Use dedicated init buffer, of size X. */ \
-+ vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
-+ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
-+ L); \
-+ vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
-+ VECT_ARRAY_VAR(vector, T1, W, N, X)); \
-+ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
-+ sizeof(VECT_VAR(result, T1, W, N)))
-+
-+ /* Overwrite "result" with the contents of "result_bis"[Y]. */
-+#undef TEST_EXTRA_CHUNK
-+#define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \
-+ memcpy(VECT_VAR(result, T1, W, N), \
-+ &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \
-+ sizeof(VECT_VAR(result, T1, W, N)));
-+
-+  /* Add some padding to try to catch out-of-bounds accesses. */
-+#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42}
-+#define DUMMY_ARRAY(V, T, W, N, L) \
-+ VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \
-+ ARRAY1(V##_pad,T,W,N)
-+
-+#define DECL_ALL_VLD_STX_LANE(X) \
-+ DECL_VLD_STX_LANE(poly, 64, 1, X); \
-+ DECL_VLD_STX_LANE(poly, 64, 2, X);
-+
-+#define TEST_ALL_VLDX_LANE(X) \
-+ TEST_VLDX_LANE(, poly, p, 64, 1, X, 0); \
-+ TEST_VLDX_LANE(q, poly, p, 64, 2, X, 0);
-+
-+#define TEST_ALL_EXTRA_CHUNKS(X,Y) \
-+ TEST_EXTRA_CHUNK(poly, 64, 1, X, Y) \
-+ TEST_EXTRA_CHUNK(poly, 64, 2, X, Y)
-+
-+#define CHECK_RESULTS_VLD_STX_LANE(test_name,EXPECTED,comment) \
-+ CHECK_POLY(test_name, poly, 64, 1, PRIx64, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);
-+
-+ /* Declare the temporary buffers / variables. */
-+ DECL_ALL_VLD_STX_LANE(2);
-+ DECL_ALL_VLD_STX_LANE(3);
-+ DECL_ALL_VLD_STX_LANE(4);
-+
-+ DUMMY_ARRAY(buffer_src, poly, 64, 1, 4);
-+ DUMMY_ARRAY(buffer_src, poly, 64, 2, 4);
-+
-+ /* Check vld2_lane/vld2q_lane. */
-+ clean_results ();
-+#undef TEST_MSG
-+#define TEST_MSG "VLD2_LANE/VLD2Q_LANE"
-+ TEST_ALL_VLDX_LANE(2);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_0, " chunk 0");
-+
-+ TEST_ALL_EXTRA_CHUNKS(2, 1);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st2_1, " chunk 1");
-+
-+ /* Check vld3_lane/vld3q_lane. */
-+ clean_results ();
-+#undef TEST_MSG
-+#define TEST_MSG "VLD3_LANE/VLD3Q_LANE"
-+ TEST_ALL_VLDX_LANE(3);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_0, " chunk 0");
-+
-+ TEST_ALL_EXTRA_CHUNKS(3, 1);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_1, " chunk 1");
-+
-+ TEST_ALL_EXTRA_CHUNKS(3, 2);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st3_2, " chunk 2");
-+
-+ /* Check vld4_lane/vld4q_lane. */
-+ clean_results ();
-+#undef TEST_MSG
-+#define TEST_MSG "VLD4_LANE/VLD4Q_LANE"
-+ TEST_ALL_VLDX_LANE(4);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_0, " chunk 0");
-+
-+ TEST_ALL_EXTRA_CHUNKS(4, 1);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_1, " chunk 1");
-+
-+ TEST_ALL_EXTRA_CHUNKS(4, 2);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_2, " chunk 2");
-+
-+ TEST_ALL_EXTRA_CHUNKS(4, 3);
-+ CHECK_RESULTS_VLD_STX_LANE (TEST_MSG, expected_vld_st4_3, " chunk 3");
-+
-+ /* In this case, input variables are arrays of vectors. */
-+#define DECL_VSTX_LANE(T1, W, N, X) \
-+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
-+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
-+ VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
-+
-+  /* We need to use a temporary result buffer (result_bis), because
-+     the one used for other tests is not large enough. A subset of the
-+     result data is moved from result_bis to result, and it is this
-+     subset which is used to check the actual behavior. The next
-+     macro enables moving another chunk of data from result_bis to
-+     result. */
-+  /* We also use another extra input buffer (buffer_src), which we
-+     fill with 0xAA, and which is used to load a vector from which we
-+     read a given lane. */
-+#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L) \
-+ memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
-+ sizeof(VECT_VAR(buffer_src, T1, W, N))); \
-+ memset (VECT_VAR(result_bis_##X, T1, W, N), 0, \
-+ sizeof(VECT_VAR(result_bis_##X, T1, W, N))); \
-+ \
-+ VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
-+ vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
-+ \
-+ VECT_ARRAY_VAR(vector, T1, W, N, X) = \
-+ /* Use dedicated init buffer, of size X. */ \
-+ vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
-+ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
-+ L); \
-+ vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
-+ VECT_ARRAY_VAR(vector, T1, W, N, X), \
-+ L); \
-+ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
-+ sizeof(VECT_VAR(result, T1, W, N)));
-+
-+#define TEST_ALL_VSTX_LANE(X) \
-+ TEST_VSTX_LANE(, poly, p, 64, 1, X, 0); \
-+ TEST_VSTX_LANE(q, poly, p, 64, 2, X, 0);
-+
-+ /* Check vst2_lane/vst2q_lane. */
-+ clean_results ();
-+#undef TEST_MSG
-+#define TEST_MSG "VST2_LANE/VST2Q_LANE"
-+ TEST_ALL_VSTX_LANE(2);
-+
-+#define CMT " (chunk 0)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_0, CMT);
-+
-+ TEST_ALL_EXTRA_CHUNKS(2, 1);
-+#undef CMT
-+#define CMT " chunk 1"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st2_1, CMT);
-+
-+ /* Check vst3_lane/vst3q_lane. */
-+ clean_results ();
-+#undef TEST_MSG
-+#define TEST_MSG "VST3_LANE/VST3Q_LANE"
-+ TEST_ALL_VSTX_LANE(3);
-+
-+#undef CMT
-+#define CMT " (chunk 0)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_0, CMT);
-+
-+ TEST_ALL_EXTRA_CHUNKS(3, 1);
-+
-+#undef CMT
-+#define CMT " (chunk 1)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_1, CMT);
-+
-+ TEST_ALL_EXTRA_CHUNKS(3, 2);
-+
-+#undef CMT
-+#define CMT " (chunk 2)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st3_2, CMT);
-+
-+ /* Check vst4_lane/vst4q_lane. */
-+ clean_results ();
-+#undef TEST_MSG
-+#define TEST_MSG "VST4_LANE/VST4Q_LANE"
-+ TEST_ALL_VSTX_LANE(4);
-+
-+#undef CMT
-+#define CMT " (chunk 0)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_0, CMT);
-+
-+ TEST_ALL_EXTRA_CHUNKS(4, 1);
-+
-+#undef CMT
-+#define CMT " (chunk 1)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_1, CMT);
-+
-+ TEST_ALL_EXTRA_CHUNKS(4, 2);
-+
-+#undef CMT
-+#define CMT " (chunk 2)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_2, CMT);
-+
-+ TEST_ALL_EXTRA_CHUNKS(4, 3);
-+
-+#undef CMT
-+#define CMT " (chunk 3)"
-+ CHECK_POLY(TEST_MSG, poly, 64, 1, PRIx64, expected_vld_st4_3, CMT);
-+
-+#endif /* __aarch64__. */
-+
-+ return 0;
-+}
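The vsli/vsri expected values in this file can be traced with a scalar model of the two shift-insert operations: VSLI shifts the second operand left by n and preserves the low n bits of the first, while VSRI shifts it right and preserves the high n bits. A sketch, valid for 0 < n < 64 (not the test's own code):

#include <stdint.h>
#include <stdio.h>

static uint64_t
sli (uint64_t a, uint64_t b, unsigned n)   /* shift left and insert */
{
  return (b << n) | (a & ((1ULL << n) - 1));
}

static uint64_t
sri (uint64_t a, uint64_t b, unsigned n)   /* shift right and insert */
{
  return (b >> n) | (a & ~(~0ULL >> n));
}

int main (void)
{
  /* vsli_expected: insert 2 into 0xf...f0 by 3 -> 0x10.  */
  printf ("%#llx\n", (unsigned long long) sli (0xfffffffffffffff0ULL, 2, 3));
  /* vsri_expected: 2 >> 3 is 0, the top 3 bits survive
     -> 0xe000000000000000.  */
  printf ("%#llx\n", (unsigned long long) sri (0xfffffffffffffff0ULL, 2, 3));
  return 0;
}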
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ternary_scalar_op.inc
-@@ -0,0 +1,206 @@
-+/* Template file for ternary scalar operator validation.
-+
-+   This file is meant to be included by test files for ternary scalar
-+   operations. */
-+
-+/* Check for required settings. */
-+
-+#ifndef INSN_NAME
-+#error INSN_NAME (the intrinsic to test) must be defined.
-+#endif
-+
-+#ifndef INPUT_TYPE
-+#error INPUT_TYPE (basic type of an input value) must be defined.
-+#endif
-+
-+#ifndef OUTPUT_TYPE
-+#error OUTPUT_TYPE (basic type of an output value) must be defined.
-+#endif
-+
-+#ifndef OUTPUT_TYPE_SIZE
-+#error OUTPUT_TYPE_SIZE (size in bits of an output value) must be defined.
-+#endif
-+
-+/* Optional settings:
-+
-+ INPUT_1: Input values for the first parameter. Must be of type INPUT_TYPE.
-+ INPUT_2: Input values for the second parameter. Must be of type INPUT_TYPE.
-+ INPUT_3: Input values for the third parameter. Must be of type
-+ INPUT_TYPE. */
-+
-+#ifndef TEST_MSG
-+#define TEST_MSG "unnamed test"
-+#endif
-+
-+/* The test framework. */
-+
-+#include <stdio.h>
-+
-+extern void abort ();
-+
-+#define INFF __builtin_inf ()
-+
-+/* Stringify a macro. */
-+#define STR0(A) #A
-+#define STR(A) STR0 (A)
-+
-+/* Macro concatenation. */
-+#define CAT0(A, B) A##B
-+#define CAT(A, B) CAT0 (A, B)
-+
-+/* Format strings for error reporting. */
-+#define FMT16 "0x%04x"
-+#define FMT32 "0x%08x"
-+#define FMT CAT (FMT,OUTPUT_TYPE_SIZE)
-+
-+/* Type construction: forms TS_t, where T is the base type and S the size in
-+ bits. */
-+#define MK_TYPE0(T, S) T##S##_t
-+#define MK_TYPE(T, S) MK_TYPE0 (T, S)
-+
-+/* Convenience type for output data. */
-+typedef MK_TYPE (uint, OUTPUT_TYPE_SIZE) output_hex_type;
-+
-+/* Conversion between typed values and their hexadecimal representation. */
-+typedef union
-+{
-+ OUTPUT_TYPE value;
-+ output_hex_type hex;
-+} output_conv_type;
-+
-+/* Default input values. */
-+
-+float16_t input_1_float16_t[] =
-+{
-+ 0.0,
-+ -0.0,
-+ 2.0,
-+ 3.1,
-+ 20.0,
-+ 0.40,
-+ -2.3,
-+ 1.33,
-+ -7.6,
-+ 0.31,
-+ 0.3353,
-+ 0.5,
-+ 1.0,
-+ 13.13,
-+ -6.3,
-+ 20.0,
-+ (float16_t)INFF,
-+ (float16_t)-INFF,
-+};
-+
-+float16_t input_2_float16_t[] =
-+{
-+ 1.0,
-+ 1.0,
-+ -4.33,
-+ 100.0,
-+ 30.0,
-+ -0.02,
-+ 0.5,
-+ -7.231,
-+ -6.3,
-+ 20.0,
-+ -7.231,
-+ 2.3,
-+ -7.6,
-+ 5.1,
-+ 0.31,
-+ 0.33353,
-+ (float16_t)-INFF,
-+ (float16_t)INFF,
-+};
-+
-+float16_t input_3_float16_t[] =
-+{
-+ -0.0,
-+ 0.0,
-+ 0.31,
-+ -0.31,
-+ 1.31,
-+ 2.1,
-+ -6.3,
-+ 1.0,
-+ -1.5,
-+ 5.1,
-+ 0.3353,
-+ 9.3,
-+ -9.3,
-+ -7.231,
-+ 0.5,
-+ -0.33,
-+ (float16_t)INFF,
-+ (float16_t)INFF,
-+};
-+
-+#ifndef INPUT_1
-+#define INPUT_1 CAT (input_1_,INPUT_TYPE)
-+#endif
-+
-+#ifndef INPUT_2
-+#define INPUT_2 CAT (input_2_,INPUT_TYPE)
-+#endif
-+
-+#ifndef INPUT_3
-+#define INPUT_3 CAT (input_3_,INPUT_TYPE)
-+#endif
-+
-+/* Support macros and routines for the test function. */
-+
-+#define CHECK() \
-+ { \
-+ output_conv_type actual; \
-+ output_conv_type expect; \
-+ \
-+ expect.hex = ((output_hex_type*)EXPECTED)[index]; \
-+ actual.value = INSN_NAME ((INPUT_1)[index], \
-+ (INPUT_2)[index], \
-+ (INPUT_3)[index]); \
-+ \
-+ if (actual.hex != expect.hex) \
-+ { \
-+ fprintf (stderr, \
-+ "ERROR in %s (%s line %d), buffer %s, " \
-+ "index %d: got " \
-+ FMT " != " FMT "\n", \
-+ TEST_MSG, __FILE__, __LINE__, \
-+ STR (EXPECTED), index, \
-+ actual.hex, expect.hex); \
-+ abort (); \
-+ } \
-+ fprintf (stderr, "CHECKED %s %s\n", \
-+ STR (EXPECTED), TEST_MSG); \
-+ }
-+
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1 (NAME)
-+
-+/* The test function. */
-+
-+void
-+FNNAME (INSN_NAME) (void)
-+{
-+  /* Basic test: y[i] = OP (x1[i], x2[i], x3[i]) for each set of inputs, then
-+     compare the result against EXPECTED[i]. */
-+
-+ const int num_tests = sizeof (INPUT_1) / sizeof (INPUT_1[0]);
-+ int index;
-+
-+ for (index = 0; index < num_tests; index++)
-+ CHECK ();
-+
-+#ifdef EXTRA_TESTS
-+ EXTRA_TESTS ();
-+#endif
-+}
-+
-+int
-+main (void)
-+{
-+ FNNAME (INSN_NAME) ();
-+
-+ return 0;
-+}
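A consumer of this template follows the same pattern as the unary and binary scalar tests elsewhere in the series; the sketch below assumes vfmah_f16 (scalar fused multiply-add) as the intrinsic and elides the 18 expected bit patterns a real test would define:

#include <arm_fp16.h>

#define TEST_MSG "VFMAH_F16"
#define INSN_NAME vfmah_f16      /* assumed intrinsic under test */

/* The real test supplies one uint16_t bit pattern per default input
   triple; they are elided here.  */
extern uint16_t expected[];
#define EXPECTED expected

#define INPUT_TYPE float16_t
#define OUTPUT_TYPE float16_t
#define OUTPUT_TYPE_SIZE 16

#include "ternary_scalar_op.inc"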
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_sat_op.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_sat_op.inc
-@@ -61,11 +61,11 @@ void FNNAME (INSN_NAME) (void)
- TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat, "");
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, int, 16, 4, PRIx8, expected, "");
-- CHECK(TEST_MSG, int, 32, 2, PRIx8, expected, "");
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
-+ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
-- CHECK(TEST_MSG, int, 16, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, int, 32, 4, PRIx8, expected, "");
-+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
-+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
-
- #ifdef EXTRA_TESTS
- EXTRA_TESTS();
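The hunk above makes the format-width macro match the element width being printed. Only the failure logs are affected, but the distinction is real: PRIx8 promises a uint8_t-sized operand, and on C libraries where it expands to "hhx" a wider lane printed through it shows only its low byte. A standalone illustration:

#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  uint32_t lane = 0xfffffff0;
  /* Where PRIx8 is "hhx" this prints just 0xf0; where it is "x" it
     happens to show the full value.  Matching the macro to the lane
     width removes the ambiguity.  */
  printf ("0x%" PRIx8 "\n", lane);    /* may print 0xf0 */
  printf ("0x%" PRIx32 "\n", lane);   /* prints 0xfffffff0 */
  return 0;
}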
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/unary_scalar_op.inc
-@@ -0,0 +1,200 @@
-+/* Template file for unary scalar operator validation.
-+
-+ This file is meant to be included by test files for unary scalar
-+ operations. */
-+
-+/* Check for required settings. */
-+
-+#ifndef INSN_NAME
-+#error INSN_NAME (the intrinsic to test) must be defined.
-+#endif
-+
-+#ifndef INPUT_TYPE
-+#error INPUT_TYPE (basic type of an input value) must be defined.
-+#endif
-+
-+#ifndef SCALAR_OPERANDS
-+#ifndef EXPECTED
-+#error EXPECTED (an array of expected output values) must be defined.
-+#endif
-+#endif
-+
-+#ifndef OUTPUT_TYPE
-+#error OUTPUT_TYPE (basic type of an output value) must be defined.
-+#endif
-+
-+#ifndef OUTPUT_TYPE_SIZE
-+#error OUTPUT_TYPE_SIZE (size in bits of an output value) must be defined.
-+#endif
-+
-+/* Optional settings. */
-+
-+/* SCALAR_OPERANDS: Defined iff the intrinsic has a scalar operand.
-+
-+   SCALAR_1, SCALAR_2, ..., SCALAR_4: If SCALAR_OPERANDS is defined, SCALAR_<n>
-+   is the scalar operand and EXPECTED_<n> is the array of expected values.
-+
-+ INPUT: Input values for the first parameter. Must be of type INPUT_TYPE. */
-+
-+/* Additional comments for the error message. */
-+#ifndef COMMENT
-+#define COMMENT ""
-+#endif
-+
-+#ifndef TEST_MSG
-+#define TEST_MSG "unnamed test"
-+#endif
-+
-+/* The test framework. */
-+
-+#include <stdio.h>
-+
-+extern void abort ();
-+
-+#define INFF __builtin_inf ()
-+
-+/* Stringify a macro. */
-+#define STR0(A) #A
-+#define STR(A) STR0 (A)
-+
-+/* Macro concatenation. */
-+#define CAT0(A, B) A##B
-+#define CAT(A, B) CAT0 (A, B)
-+
-+/* Format strings for error reporting. */
-+#define FMT16 "0x%04x"
-+#define FMT32 "0x%08x"
-+#define FMT64 "0x%016llx"
-+#define FMT CAT (FMT,OUTPUT_TYPE_SIZE)
-+
-+/* Type construction: forms TS_t, where T is the base type and S the size in
-+ bits. */
-+#define MK_TYPE0(T, S) T##S##_t
-+#define MK_TYPE(T, S) MK_TYPE0 (T, S)
-+
-+/* Convenience type for output data. */
-+typedef MK_TYPE (uint, OUTPUT_TYPE_SIZE) output_hex_type;
-+
-+/* Conversion between typed values and their hexadecimal representation. */
-+typedef union
-+{
-+ OUTPUT_TYPE value;
-+ output_hex_type hex;
-+} output_conv_type;
-+
-+/* Default input values. */
-+
-+float16_t input_1_float16_t[] =
-+{
-+ 0.0, -0.0,
-+ 2.0, 3.1,
-+ 20.0, 0.40,
-+ -2.3, 1.33,
-+ -7.6, 0.31,
-+ 0.3353, 0.5,
-+ 1.0, 13.13,
-+ -6.3, 20.0,
-+ (float16_t)INFF, (float16_t)-INFF,
-+};
-+
-+#ifndef INPUT
-+#define INPUT CAT(input_1_,INPUT_TYPE)
-+#endif
-+
-+/* Support macros and routines for the test function. */
-+
-+#define CHECK() \
-+ { \
-+ output_conv_type actual; \
-+ output_conv_type expect; \
-+ \
-+ expect.hex = ((output_hex_type*)EXPECTED)[index]; \
-+ actual.value = INSN_NAME ((INPUT)[index]); \
-+ \
-+ if (actual.hex != expect.hex) \
-+ { \
-+ fprintf (stderr, \
-+ "ERROR in %s (%s line %d), buffer %s, " \
-+ "index %d: got " \
-+ FMT " != " FMT "\n", \
-+ TEST_MSG, __FILE__, __LINE__, \
-+ STR (EXPECTED), index, \
-+ actual.hex, expect.hex); \
-+ abort (); \
-+ } \
-+ fprintf (stderr, "CHECKED %s %s\n", \
-+ STR (EXPECTED), TEST_MSG); \
-+ }
-+
-+#define CHECK_N(SCALAR, EXPECTED) \
-+ { \
-+ output_conv_type actual; \
-+ output_conv_type expect; \
-+ \
-+ expect.hex \
-+ = ((output_hex_type*)EXPECTED)[index]; \
-+ actual.value = INSN_NAME ((INPUT)[index], (SCALAR)); \
-+ \
-+ if (actual.hex != expect.hex) \
-+ { \
-+ fprintf (stderr, \
-+ "ERROR in %s (%s line %d), buffer %s, " \
-+ "index %d: got " \
-+ FMT " != " FMT "\n", \
-+ TEST_MSG, __FILE__, __LINE__, \
-+ STR (EXPECTED), index, \
-+ actual.hex, expect.hex); \
-+ abort (); \
-+ } \
-+ fprintf (stderr, "CHECKED %s %s\n", \
-+ STR (EXPECTED), TEST_MSG); \
-+ }
-+
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1 (NAME)
-+
-+/* The test function. */
-+
-+void
-+FNNAME (INSN_NAME) (void)
-+{
-+ /* Basic test: y[i] = OP (x[i]), for each INPUT[i], then compare the result
-+ against EXPECTED[i]. */
-+
-+ const int num_tests = sizeof (INPUT) / sizeof (INPUT[0]);
-+ int index;
-+
-+ for (index = 0; index < num_tests; index++)
-+ {
-+#if defined (SCALAR_OPERANDS)
-+
-+#ifdef SCALAR_1
-+ CHECK_N (SCALAR_1, EXPECTED_1);
-+#endif
-+#ifdef SCALAR_2
-+ CHECK_N (SCALAR_2, EXPECTED_2);
-+#endif
-+#ifdef SCALAR_3
-+ CHECK_N (SCALAR_3, EXPECTED_3);
-+#endif
-+#ifdef SCALAR_4
-+ CHECK_N (SCALAR_4, EXPECTED_4);
-+#endif
-+
-+#else /* !defined (SCALAR_OPERANDS). */
-+ CHECK ();
-+#endif
-+ }
-+
-+#ifdef EXTRA_TESTS
-+ EXTRA_TESTS ();
-+#endif
-+}
-+
-+int
-+main (void)
-+{
-+ FNNAME (INSN_NAME) ();
-+
-+ return 0;
-+}
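The output_conv_type union above is what makes the comparison bit-exact: floating-point values can compare equal while their encodings differ, so CHECK and CHECK_N compare the .hex member rather than .value. A standalone illustration with single precision:

#include <stdint.h>
#include <stdio.h>

typedef union
{
  float value;
  uint32_t hex;
} conv32;

int main (void)
{
  conv32 a = { .value = -0.0f };
  conv32 b = { .value = 0.0f };
  /* Equal as floats, yet the bit patterns differ.  */
  printf ("%d 0x%08x 0x%08x\n", a.value == b.value, a.hex, b.hex);
  /* prints: 1 0x80000000 0x00000000  */
  return 0;
}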
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabd.c
-@@ -30,10 +30,20 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffd0, 0xffffffd1,
- 0xffffffd2, 0xffffffd3 };
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x42407ae1, 0x423c7ae1,
- 0x42387ae1, 0x42347ae1 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x4e13, 0x4dd3,
-+ 0x4d93, 0x4d53 };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x5204, 0x51e4, 0x51c4, 0x51a4,
-+ 0x5184, 0x5164, 0x5144, 0x5124 };
-+#endif
-
- /* Additional expected results for float32 variants with specially
- chosen input values. */
- VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+#endif
-
- #define TEST_MSG "VABD/VABDQ"
- void exec_vabd (void)
-@@ -65,6 +75,17 @@ void exec_vabd (void)
- DECL_VABD_VAR(vector2);
- DECL_VABD_VAR(vector_res);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector1, float, 16, 4);
-+ DECL_VARIABLE(vector1, float, 16, 8);
-+
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+
- clean_results ();
-
- /* Initialize input "vector1" from "buffer". */
-@@ -82,6 +103,12 @@ void exec_vabd (void)
- VLOAD(vector1, buffer, q, uint, u, 16, 8);
- VLOAD(vector1, buffer, q, uint, u, 32, 4);
- VLOAD(vector1, buffer, q, float, f, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector1, buffer, , float, f, 16, 4);
-+ VLOAD(vector1, buffer, , float, f, 16, 4);
-+ VLOAD(vector1, buffer, q, float, f, 16, 8);
-+ VLOAD(vector1, buffer, q, float, f, 16, 8);
-+#endif
-
- /* Choose init value arbitrarily. */
- VDUP(vector2, , int, s, 8, 8, 1);
-@@ -98,6 +125,10 @@ void exec_vabd (void)
- VDUP(vector2, q, uint, u, 16, 8, 12);
- VDUP(vector2, q, uint, u, 32, 4, 32);
- VDUP(vector2, q, float, f, 32, 4, 32.12f);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 8.3f);
-+ VDUP(vector2, q, float, f, 16, 8, 32.12f);
-+#endif
-
- /* Execute the tests. */
- TEST_VABD(, int, s, 8, 8);
-@@ -115,6 +146,11 @@ void exec_vabd (void)
- TEST_VABD(q, uint, u, 32, 4);
- TEST_VABD(q, float, f, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VABD(, float, f, 16, 4);
-+ TEST_VABD(q, float, f, 16, 8);
-+#endif
-+
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
-@@ -129,7 +165,10 @@ void exec_vabd (void)
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
--
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
-
- /* Extra FP tests with special values (-0.0, ....) */
- VDUP(vector1, q, float, f, 32, 4, -0.0f);
-@@ -137,11 +176,27 @@ void exec_vabd (void)
- TEST_VABD(q, float, f, 32, 4);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, " FP special (-0.0)");
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector1, q, float, f, 16, 8, -0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, 0.0);
-+ TEST_VABD(q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16,
-+ " FP special (-0.0)");
-+#endif
-+
- /* Extra FP tests with special values (-0.0, ....) */
- VDUP(vector1, q, float, f, 32, 4, 0.0f);
- VDUP(vector2, q, float, f, 32, 4, -0.0);
- TEST_VABD(q, float, f, 32, 4);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, " FP special (-0.0)");
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector1, q, float, f, 16, 8, 0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, -0.0);
-+ TEST_VABD(q, float, f, 16, 8);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16,
-+ " FP special (-0.0)");
-+#endif
- }
-
- int main (void)
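Both fp16 special-value blocks above expect all-zero bit patterns because the absolute difference of -0.0 and 0.0 is exactly +0.0 in either operand order; a one-line scalar check:

#include <math.h>
#include <stdio.h>

int main (void)
{
  /* |(-0.0) - 0.0| and |0.0 - (-0.0)| are both +0.0.  */
  printf ("%g %g\n", fabs (-0.0 - 0.0), fabs (0.0 - -0.0));  /* 0 0 */
  return 0;
}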
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabdh_f16_1.c
-@@ -0,0 +1,44 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+#define INFF __builtin_inf ()
-+
-+/* Expected results.
-+   Absolute difference between INPUT_1 and INPUT_2 in binary_scalar_op.inc. */
-+uint16_t expected[] =
-+{
-+ 0x3C00,
-+ 0x3C00,
-+ 0x4654,
-+ 0x560E,
-+ 0x4900,
-+ 0x36B8,
-+ 0x419a,
-+ 0x4848,
-+ 0x3d34,
-+ 0x4cec,
-+ 0x4791,
-+ 0x3f34,
-+ 0x484d,
-+ 0x4804,
-+ 0x469c,
-+ 0x4ceb,
-+ 0x7c00,
-+ 0x7c00
-+};
-+
-+#define TEST_MSG "VABDH_F16"
-+#define INSN_NAME vabdh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabs.c
-@@ -21,24 +21,52 @@ VECT_VAR_DECL(expected,int,32,4) [] = { 0x10, 0xf, 0xe, 0xd };
- /* Expected results for float32 variants. Needs to be separated since
- the generic test function does not test floating-point
- versions. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0x409a, 0x409a,
-+ 0x409a, 0x409a };
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x42cd, 0x42cd,
-+ 0x42cd, 0x42cd,
-+ 0x42cd, 0x42cd,
-+ 0x42cd, 0x42cd };
-+#endif
- VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0x40133333, 0x40133333 };
- VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x4059999a, 0x4059999a,
- 0x4059999a, 0x4059999a };
-
- void exec_vabs_f32(void)
- {
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
- DECL_VARIABLE(vector_res, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, -2.3f);
-+ VDUP(vector, q, float, f, 16, 8, 3.4f);
-+#endif
- VDUP(vector, , float, f, 32, 2, -2.3f);
- VDUP(vector, q, float, f, 32, 4, 3.4f);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_UNARY_OP(INSN_NAME, , float, f, 16, 4);
-+ TEST_UNARY_OP(INSN_NAME, q, float, f, 16, 8);
-+#endif
- TEST_UNARY_OP(INSN_NAME, , float, f, 32, 2);
- TEST_UNARY_OP(INSN_NAME, q, float, f, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vabsh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4233 /* 3.099609 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x3666 /* 0.399902 */,
-+ 0x409a /* 2.300781 */,
-+ 0x3d52 /* 1.330078 */,
-+ 0x479a /* 7.601562 */,
-+ 0x34f6 /* 0.310059 */,
-+ 0x355d /* 0.335205 */,
-+ 0x3800 /* 0.500000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a91 /* 13.132812 */,
-+ 0x464d /* 6.300781 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */
-+};
-+
-+#define TEST_MSG "VABSH_F16"
-+#define INSN_NAME vabsh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vadd.c
-@@ -43,6 +43,14 @@ VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff3,
- VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0x40d9999a, 0x40d9999a };
- VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0x41100000, 0x41100000,
- 0x41100000, 0x41100000 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0x46cd, 0x46cd,
-+ 0x46cd, 0x46cd };
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0x4880, 0x4880,
-+ 0x4880, 0x4880,
-+ 0x4880, 0x4880,
-+ 0x4880, 0x4880 };
-+#endif
-
- void exec_vadd_f32(void)
- {
-@@ -66,4 +74,27 @@ void exec_vadd_f32(void)
-
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+
-+ VDUP(vector, , float, f, 16, 4, 2.3f);
-+ VDUP(vector, q, float, f, 16, 8, 3.4f);
-+
-+ VDUP(vector2, , float, f, 16, 4, 4.5f);
-+ VDUP(vector2, q, float, f, 16, 8, 5.6f);
-+
-+ TEST_BINARY_OP(INSN_NAME, , float, f, 16, 4);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
-+#endif
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc0a8 /* -2.328125 */,
-+ 0x5672 /* 103.125000 */,
-+ 0x5240 /* 50.000000 */,
-+ 0x3614 /* 0.379883 */,
-+ 0xbf34 /* -1.800781 */,
-+ 0xc5e6 /* -5.898438 */,
-+ 0xcaf4 /* -13.906250 */,
-+ 0x4d14 /* 20.312500 */,
-+ 0xc6e5 /* -6.894531 */,
-+ 0x419a /* 2.800781 */,
-+ 0xc69a /* -6.601562 */,
-+ 0x4c8f /* 18.234375 */,
-+ 0xc5fe /* -5.992188 */,
-+ 0x4d15 /* 20.328125 */,
-+ 0x7e00 /* nan */,
-+ 0x7e00 /* nan */,
-+};
-+
-+#define TEST_MSG "VADDH_F16"
-+#define INSN_NAME vaddh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbsl.c
-@@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffff1 };
- VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
- 0xf7, 0xf7, 0xf7, 0xf7 };
- VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc09, 0xcb89,
-+ 0xcb09, 0xca89 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800004, 0xc1700004 };
- VECT_VAR_DECL(expected,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
- 0xf6, 0xf6, 0xf6, 0xf6,
-@@ -43,6 +47,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
- 0xf7, 0xf7, 0xf7, 0xf7 };
- VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff2, 0xfff2,
- 0xfff4, 0xfff4, 0xfff6, 0xfff6 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc09, 0xcb89,
-+ 0xcb09, 0xca89,
-+ 0xca09, 0xc989,
-+ 0xc909, 0xc889 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800001, 0xc1700001,
- 0xc1600001, 0xc1500001 };
-
-@@ -66,6 +76,10 @@ void exec_vbsl (void)
- clean_results ();
-
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
-@@ -80,6 +94,9 @@ void exec_vbsl (void)
- VDUP(vector2, , uint, u, 16, 4, 0xFFF2);
- VDUP(vector2, , uint, u, 32, 2, 0xFFFFFFF0);
- VDUP(vector2, , uint, u, 64, 1, 0xFFFFFFF3);
-+#if defined (FP16_SUPPORTED)
-+ VDUP(vector2, , float, f, 16, 4, -2.4f); /* -2.4f is 0xC0CD. */
-+#endif
- VDUP(vector2, , float, f, 32, 2, -30.3f);
- VDUP(vector2, , poly, p, 8, 8, 0xF3);
- VDUP(vector2, , poly, p, 16, 4, 0xFFF2);
-@@ -94,6 +111,9 @@ void exec_vbsl (void)
- VDUP(vector2, q, uint, u, 64, 2, 0xFFFFFFF3);
- VDUP(vector2, q, poly, p, 8, 16, 0xF3);
- VDUP(vector2, q, poly, p, 16, 8, 0xFFF2);
-+#if defined (FP16_SUPPORTED)
-+ VDUP(vector2, q, float, f, 16, 8, -2.4f);
-+#endif
- VDUP(vector2, q, float, f, 32, 4, -30.4f);
-
- VDUP(vector_first, , uint, u, 8, 8, 0xF4);
-@@ -111,10 +131,18 @@ void exec_vbsl (void)
- TEST_VBSL(uint, , poly, p, 16, 4);
- TEST_VBSL(uint, q, poly, p, 8, 16);
- TEST_VBSL(uint, q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VBSL(uint, , float, f, 16, 4);
-+ TEST_VBSL(uint, q, float, f, 16, 8);
-+#endif
- TEST_VBSL(uint, , float, f, 32, 2);
- TEST_VBSL(uint, q, float, f, 32, 4);
-
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS (TEST_MSG, "");
-+#else
- CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
-+#endif
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcage.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcage.c
-@@ -11,3 +11,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- 0xffffffff, 0xffffffff };
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0xffff, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0xffff, 0xffff, 0xffff, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0xffff, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
-+ 0xffff, 0xffff, 0xffff, 0x0 };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcageh_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
-+ 0xFFFF};
-+
-+#define TEST_MSG "VCAGEH_F16"
-+#define INSN_NAME vcageh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcagt.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcagt.c
-@@ -11,3 +11,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- 0xffffffff, 0xffffffff };
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0xffff, 0xffff, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0xffff, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
-+ 0xffff, 0xffff, 0x0, 0x0 };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcagth_f16_1.c
-@@ -0,0 +1,21 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0};
-+
-+#define TEST_MSG "VCAGTH_F16"
-+#define INSN_NAME vcagth_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcale.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcale.c
-@@ -9,3 +9,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0xffffffff };
-
- VECT_VAR_DECL(expected2,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0xffff, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0x0, 0x0, 0xffff, 0xffff,
-+ 0xffff, 0xffff, 0xffff, 0xffff };
-+
-+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0xffff, 0xffff };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcaleh_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
-+ 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
-+ 0x0, 0xFFFF, 0xFFFF};
-+
-+#define TEST_MSG "VCALEH_F16"
-+#define INSN_NAME vcaleh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcalt.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcalt.c
-@@ -9,3 +9,13 @@ VECT_VAR_DECL(expected,uint,32,4) [] = { 0x0, 0x0, 0x0, 0xffffffff };
-
- VECT_VAR_DECL(expected2,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, uint, 16, 4) [] = { 0x0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL (expected, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0xffff,
-+ 0xffff, 0xffff, 0xffff, 0xffff };
-+
-+VECT_VAR_DECL (expected2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected2, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0xffff };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcalth_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
-+ 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0x0,
-+ 0x0, 0x0, 0x0};
-+
-+#define TEST_MSG "VCALTH_F16"
-+#define INSN_NAME vcalth_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceq.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceq.c
-@@ -32,6 +32,12 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0xffff, 0x0 };
- VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0x0 };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0xffff, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0xffff, 0x0,
-+						      0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
- VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0x0, 0xffffffff };
- VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0x0 };
-
-@@ -39,6 +45,18 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0xffffffff, 0x0 };
- VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0x0, 0xffffffff };
- VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0xffffffff, 0x0 };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
- VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceqh_f16_1.c
-@@ -0,0 +1,21 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
-+
-+#define TEST_MSG "VCEQH_F16"
-+#define INSN_NAME vceqh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceqz_1.c
-@@ -0,0 +1,27 @@
-+/* This file tests an intrinsic which currently has only an f16 variant and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#define INSN_NAME vceqz
-+#define TEST_MSG "VCEQZ/VCEQZQ"
-+
-+#include "cmp_zero_op.inc"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
-+/* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vceqzh_f16_1.c
-@@ -0,0 +1,21 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
-+
-+#define TEST_MSG "VCEQZH_F16"
-+#define INSN_NAME vceqzh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcge.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcge.c
-@@ -28,6 +28,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0, 0x0, 0xffff, 0xffff };
- VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0xffffffff };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
- VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0x0, 0xffffffff };
- VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0x0, 0x0, 0xffffffff, 0xffffffff };
-
-@@ -35,6 +43,20 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0x0, 0xffffffff };
- VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0xffffffff, 0xffffffff };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
- VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgeh_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
-+ 0xFFFF, 0x0};
-+
-+#define TEST_MSG "VCGEH_F16"
-+#define INSN_NAME vcgeh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgez_1.c
-@@ -0,0 +1,30 @@
-+/* This file tests an intrinsic which currently has only an f16 variant and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#define INSN_NAME vcgez
-+#define TEST_MSG "VCGEZ/VCGEZQ"
-+
-+#include "cmp_zero_op.inc"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
-+/* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgezh_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0,
-+ 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
-+ 0x0, 0xFFFF, 0xFFFF, 0x0};
-+
-+#define TEST_MSG "VCGEZH_F16"
-+#define INSN_NAME vcgezh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgt.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgt.c
-@@ -28,6 +28,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0xffff };
- VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0x0, 0x0, 0x0, 0xffffffff };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0,
-+ 0x0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
- VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0x0, 0x0, 0x0, 0xffffffff };
-
-@@ -35,6 +43,19 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0x0, 0xffffffff };
- VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0x0, 0xffffffff };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
- VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgth_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
-+ 0xFFFF, 0x0};
-+
-+#define TEST_MSG "VCGTH_F16"
-+#define INSN_NAME vcgth_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgtz_1.c
-@@ -0,0 +1,28 @@
-+/* This file tests an intrinsic which currently has only an f16 variant and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#define INSN_NAME vcgtz
-+#define TEST_MSG "VCGTZ/VCGTZQ"
-+
-+#include "cmp_zero_op.inc"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
-+/* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcgtzh_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0,
-+ 0xFFFF, 0xFFFF, 0x0};
-+
-+#define TEST_MSG "VCGTZH_F16"
-+#define INSN_NAME vcgtzh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcle.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcle.c
-@@ -31,6 +31,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
- VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- 0xffffffff, 0x0 };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0xffff, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
-+ 0xffff, 0x0,
-+ 0x0, 0x0,
-+ 0x0, 0x0 };
-+#endif
-+
- VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- 0xffffffff, 0x0 };
-@@ -39,6 +47,20 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0xffffffff, 0x0 };
- VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0xffffffff, 0x0 };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
-+
- VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcleh_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0,
-+ 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0x0,
-+ 0xFFFF};
-+
-+#define TEST_MSG "VCLEH_F16"
-+#define INSN_NAME vcleh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclez_1.c
-@@ -0,0 +1,29 @@
-+/* This file tests an intrinsic which currently has only an f16 variant and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#define INSN_NAME vclez
-+#define TEST_MSG "VCLEZ/VCLEZQ"
-+
-+#include "cmp_zero_op.inc"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
-+/* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclezh_f16_1.c
-@@ -0,0 +1,21 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF};
-+
-+#define TEST_MSG "VCLEZH_F16"
-+#define INSN_NAME vclezh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclt.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclt.c
-@@ -30,6 +30,14 @@ VECT_VAR_DECL(expected_q_uint,uint,16,8) [] = { 0xffff, 0xffff, 0xffff, 0xffff,
- VECT_VAR_DECL(expected_q_uint,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- 0x0, 0x0 };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0xffff, 0xffff,
-+ 0x0, 0x0,
-+ 0x0, 0x0,
-+ 0x0, 0x0 };
-+#endif
-+
- VECT_VAR_DECL(expected_float,uint,32,2) [] = { 0xffffffff, 0x0 };
- VECT_VAR_DECL(expected_q_float,uint,32,4) [] = { 0xffffffff, 0xffffffff,
- 0x0, 0x0 };
-@@ -38,6 +46,19 @@ VECT_VAR_DECL(expected_uint2,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_uint3,uint,32,2) [] = { 0xffffffff, 0x0 };
- VECT_VAR_DECL(expected_uint4,uint,32,2) [] = { 0x0, 0x0 };
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_nan2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf2, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
- VECT_VAR_DECL(expected_nan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_mnan,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_nan2,uint,32,2) [] = { 0x0, 0x0 };
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vclth_f16_1.c
-@@ -0,0 +1,22 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0,
-+ 0xFFFF, 0xFFFF, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF, 0x0, 0x0,
-+ 0xFFFF};
-+
-+#define TEST_MSG "VCLTH_F16"
-+#define INSN_NAME vclth_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcltz_1.c
-@@ -0,0 +1,27 @@
-+/* This file tests an intrinsic which currently has only an f16 variant and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#define INSN_NAME vcltz
-+#define TEST_MSG "VCLTZ/VCLTZQ"
-+
-+#include "cmp_zero_op.inc"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_float, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_q_float, uint, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
-+/* Extra FP tests with special values (NaN, ....). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected_nan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mnan, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_inf, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+
-+VECT_VAR_DECL (expected_minf, uint, 16, 4) [] = { 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL (expected_zero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL (expected_mzero, uint, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcltzh_f16_1.c
-@@ -0,0 +1,21 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0xFFFF,
-+ 0x0, 0x0, 0x0, 0x0, 0x0, 0xFFFF, 0x0, 0x0, 0xFFFF};
-+
-+#define TEST_MSG "VCltZH_F16"
-+#define INSN_NAME vcltzh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcnt.c
-@@ -65,10 +65,10 @@ FNNAME (INSN_NAME)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
-@@ -93,8 +93,8 @@ void exec_vcombine (void)
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
- #endif
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
-@@ -106,8 +106,8 @@ FNNAME (INSN_NAME)
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
- #endif
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt.c
-@@ -4,36 +4,99 @@
- #include <math.h>
-
- /* Expected results for vcvt. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_s, hfloat, 16, 4) [] =
-+{ 0xcc00, 0xcb80, 0xcb00, 0xca80 };
-+VECT_VAR_DECL(expected_u, hfloat, 16, 4) [] =
-+{ 0x7c00, 0x7c00, 0x7c00, 0x7c00 };
-+VECT_VAR_DECL(expected_s, hfloat, 16, 8) [] =
-+{ 0xcc00, 0xcb80, 0xcb00, 0xca80,
-+ 0xca00, 0xc980, 0xc900, 0xc880 };
-+VECT_VAR_DECL(expected_u, hfloat, 16, 8) [] =
-+{ 0x7c00, 0x7c00, 0x7c00, 0x7c00,
-+  0x7c00, 0x7c00, 0x7c00, 0x7c00 };
-+#endif
- VECT_VAR_DECL(expected_s,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected_u,hfloat,32,2) [] = { 0x4f800000, 0x4f800000 };
- VECT_VAR_DECL(expected_s,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
-- 0xc1600000, 0xc1500000 };
-+ 0xc1600000, 0xc1500000 };
- VECT_VAR_DECL(expected_u,hfloat,32,4) [] = { 0x4f800000, 0x4f800000,
-- 0x4f800000, 0x4f800000 };
-+ 0x4f800000, 0x4f800000 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff1, 0x5, 0xfff1, 0x5 };
-+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x5, 0x0, 0x5 };
-+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0xf, 0xfff1,
-+ 0x0, 0x0, 0xf, 0xfff1 };
-+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0xf, 0x0,
-+ 0x0, 0x0, 0xf, 0x0 };
-+#endif
- VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff1, 0x5 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0x0, 0x5 };
- VECT_VAR_DECL(expected,int,32,4) [] = { 0x0, 0x0, 0xf, 0xfffffff1 };
- VECT_VAR_DECL(expected,uint,32,4) [] = { 0x0, 0x0, 0xf, 0x0 };
-
- /* Expected results for vcvt_n. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_vcvt_n_s, hfloat, 16, 4) [] = { 0xc400, 0xc380,
-+ 0xc300, 0xc280 };
-+VECT_VAR_DECL(expected_vcvt_n_u, hfloat, 16, 4) [] = { 0x6000, 0x6000,
-+ 0x6000, 0x6000 };
-+VECT_VAR_DECL(expected_vcvt_n_s, hfloat, 16, 8) [] = { 0xb000, 0xaf80,
-+ 0xaf00, 0xae80,
-+ 0xae00, 0xad80,
-+ 0xad00, 0xac80 };
-+VECT_VAR_DECL(expected_vcvt_n_u, hfloat, 16, 8) [] = { 0x4c00, 0x4c00,
-+ 0x4c00, 0x4c00,
-+ 0x4c00, 0x4c00,
-+ 0x4c00, 0x4c00 };
-+#endif
- VECT_VAR_DECL(expected_vcvt_n_s,hfloat,32,2) [] = { 0xc0800000, 0xc0700000 };
- VECT_VAR_DECL(expected_vcvt_n_u,hfloat,32,2) [] = { 0x4c000000, 0x4c000000 };
- VECT_VAR_DECL(expected_vcvt_n_s,hfloat,32,4) [] = { 0xb2800000, 0xb2700000,
- 0xb2600000, 0xb2500000 };
- VECT_VAR_DECL(expected_vcvt_n_u,hfloat,32,4) [] = { 0x49800000, 0x49800000,
- 0x49800000, 0x49800000 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_vcvt_n, int, 16, 4) [] = { 0xffc3, 0x15,
-+ 0xffc3, 0x15 };
-+VECT_VAR_DECL(expected_vcvt_n, uint, 16, 4) [] = { 0x0, 0x2a6, 0x0, 0x2a6 };
-+VECT_VAR_DECL(expected_vcvt_n, int, 16, 8) [] = { 0x0, 0x0, 0x78f, 0xf871,
-+ 0x0, 0x0, 0x78f, 0xf871 };
-+VECT_VAR_DECL(expected_vcvt_n, uint, 16, 8) [] = { 0x0, 0x0, 0xf1e0, 0x0,
-+ 0x0, 0x0, 0xf1e0, 0x0 };
-+#endif
- VECT_VAR_DECL(expected_vcvt_n,int,32,2) [] = { 0xff0b3333, 0x54cccd };
- VECT_VAR_DECL(expected_vcvt_n,uint,32,2) [] = { 0x0, 0x15 };
- VECT_VAR_DECL(expected_vcvt_n,int,32,4) [] = { 0x0, 0x0, 0x1e3d7, 0xfffe1c29 };
- VECT_VAR_DECL(expected_vcvt_n,uint,32,4) [] = { 0x0, 0x0, 0x1e, 0x0 };
-
- /* Expected results for vcvt with rounding. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
-+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
-+ 0x7d, 0x7d, 0x7d, 0x7d };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
-+ 0x7d, 0x7d, 0x7d, 0x7d };
-+#endif
- VECT_VAR_DECL(expected_rounding,int,32,2) [] = { 0xa, 0xa };
- VECT_VAR_DECL(expected_rounding,uint,32,2) [] = { 0xa, 0xa };
- VECT_VAR_DECL(expected_rounding,int,32,4) [] = { 0x7d, 0x7d, 0x7d, 0x7d };
- VECT_VAR_DECL(expected_rounding,uint,32,4) [] = { 0x7d, 0x7d, 0x7d, 0x7d };
-
- /* Expected results for vcvt_n with rounding. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_vcvt_n_rounding, int, 16, 4) [] =
-+{ 0x533, 0x533, 0x533, 0x533 };
-+VECT_VAR_DECL(expected_vcvt_n_rounding, uint, 16, 4) [] =
-+{ 0x533, 0x533, 0x533, 0x533 };
-+VECT_VAR_DECL(expected_vcvt_n_rounding, int, 16, 8) [] =
-+{ 0x7fff, 0x7fff, 0x7fff, 0x7fff,
-+ 0x7fff, 0x7fff, 0x7fff, 0x7fff };
-+VECT_VAR_DECL(expected_vcvt_n_rounding, uint, 16, 8) [] =
-+{ 0xffff, 0xffff, 0xffff, 0xffff,
-+ 0xffff, 0xffff, 0xffff, 0xffff };
-+#endif
- VECT_VAR_DECL(expected_vcvt_n_rounding,int,32,2) [] = { 0xa66666, 0xa66666 };
- VECT_VAR_DECL(expected_vcvt_n_rounding,uint,32,2) [] = { 0xa66666, 0xa66666 };
- VECT_VAR_DECL(expected_vcvt_n_rounding,int,32,4) [] = { 0xfbccc, 0xfbccc,
-@@ -42,11 +105,17 @@ VECT_VAR_DECL(expected_vcvt_n_rounding,uint,32,4) [] = { 0xfbccc, 0xfbccc,
- 0xfbccc, 0xfbccc };
-
- /* Expected results for vcvt_n with saturation. */
--VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,2) [] = { 0x7fffffff,
-- 0x7fffffff };
--VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,4) [] = { 0x7fffffff,
-- 0x7fffffff,
-- 0x7fffffff, 0x7fffffff };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_vcvt_n_saturation, int, 16, 4) [] =
-+{ 0x533, 0x533, 0x533, 0x533 };
-+VECT_VAR_DECL(expected_vcvt_n_saturation, int, 16, 8) [] =
-+{ 0x7fff, 0x7fff, 0x7fff, 0x7fff,
-+ 0x7fff, 0x7fff, 0x7fff, 0x7fff };
-+#endif
-+VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,2) [] =
-+{ 0x7fffffff, 0x7fffffff };
-+VECT_VAR_DECL(expected_vcvt_n_saturation,int,32,4) [] =
-+{ 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
-
- #define TEST_MSG "VCVT/VCVTQ"
- void exec_vcvt (void)
-@@ -89,11 +158,26 @@ void exec_vcvt (void)
-
- /* Initialize input "vector" from "buffer". */
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
- /* Make sure some elements have a fractional part, to exercise
- integer conversions. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VSET_LANE(vector, , float, f, 16, 4, 0, -15.3f);
-+ VSET_LANE(vector, , float, f, 16, 4, 1, 5.3f);
-+ VSET_LANE(vector, , float, f, 16, 4, 2, -15.3f);
-+ VSET_LANE(vector, , float, f, 16, 4, 3, 5.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 4, -15.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 5, 5.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 6, -15.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 7, 5.3f);
-+#endif
-+
- VSET_LANE(vector, , float, f, 32, 2, 0, -15.3f);
- VSET_LANE(vector, , float, f, 32, 2, 1, 5.3f);
- VSET_LANE(vector, q, float, f, 32, 4, 2, -15.3f);
-@@ -103,23 +187,55 @@ void exec_vcvt (void)
- before overwriting them. */
- #define TEST_MSG2 ""
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_f16_xx. */
-+ TEST_VCVT_FP(, float, f, 16, 4, int, s, expected_s);
-+ TEST_VCVT_FP(, float, f, 16, 4, uint, u, expected_u);
-+#endif
- /* vcvt_f32_xx. */
- TEST_VCVT_FP(, float, f, 32, 2, int, s, expected_s);
- TEST_VCVT_FP(, float, f, 32, 2, uint, u, expected_u);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_f16_xx. */
-+ TEST_VCVT_FP(q, float, f, 16, 8, int, s, expected_s);
-+ TEST_VCVT_FP(q, float, f, 16, 8, uint, u, expected_u);
-+#endif
- /* vcvtq_f32_xx. */
- TEST_VCVT_FP(q, float, f, 32, 4, int, s, expected_s);
- TEST_VCVT_FP(q, float, f, 32, 4, uint, u, expected_u);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_xx_f16. */
-+ TEST_VCVT(, int, s, 16, 4, float, f, expected);
-+ TEST_VCVT(, uint, u, 16, 4, float, f, expected);
-+#endif
- /* vcvt_xx_f32. */
- TEST_VCVT(, int, s, 32, 2, float, f, expected);
- TEST_VCVT(, uint, u, 32, 2, float, f, expected);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VSET_LANE(vector, q, float, f, 16, 8, 0, 0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 1, -0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 2, 15.12f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 3, -15.12f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 4, 0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 5, -0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 6, 15.12f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 7, -15.12f);
-+#endif
-+
- VSET_LANE(vector, q, float, f, 32, 4, 0, 0.0f);
- VSET_LANE(vector, q, float, f, 32, 4, 1, -0.0f);
- VSET_LANE(vector, q, float, f, 32, 4, 2, 15.12f);
- VSET_LANE(vector, q, float, f, 32, 4, 3, -15.12f);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_xx_f16. */
-+ TEST_VCVT(q, int, s, 16, 8, float, f, expected);
-+ TEST_VCVT(q, uint, u, 16, 8, float, f, expected);
-+#endif
-+
- /* vcvtq_xx_f32. */
- TEST_VCVT(q, int, s, 32, 4, float, f, expected);
- TEST_VCVT(q, uint, u, 32, 4, float, f, expected);
-@@ -129,18 +245,38 @@ void exec_vcvt (void)
- #undef TEST_MSG
- #define TEST_MSG "VCVT_N/VCVTQ_N"
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_n_f16_xx. */
-+ TEST_VCVT_N_FP(, float, f, 16, 4, int, s, 2, expected_vcvt_n_s);
-+ TEST_VCVT_N_FP(, float, f, 16, 4, uint, u, 7, expected_vcvt_n_u);
-+#endif
- /* vcvt_n_f32_xx. */
- TEST_VCVT_N_FP(, float, f, 32, 2, int, s, 2, expected_vcvt_n_s);
- TEST_VCVT_N_FP(, float, f, 32, 2, uint, u, 7, expected_vcvt_n_u);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_n_f16_xx. */
-+ TEST_VCVT_N_FP(q, float, f, 16, 8, int, s, 7, expected_vcvt_n_s);
-+ TEST_VCVT_N_FP(q, float, f, 16, 8, uint, u, 12, expected_vcvt_n_u);
-+#endif
- /* vcvtq_n_f32_xx. */
- TEST_VCVT_N_FP(q, float, f, 32, 4, int, s, 30, expected_vcvt_n_s);
- TEST_VCVT_N_FP(q, float, f, 32, 4, uint, u, 12, expected_vcvt_n_u);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_n_xx_f16. */
-+ TEST_VCVT_N(, int, s, 16, 4, float, f, 2, expected_vcvt_n);
-+ TEST_VCVT_N(, uint, u, 16, 4, float, f, 7, expected_vcvt_n);
-+#endif
- /* vcvt_n_xx_f32. */
- TEST_VCVT_N(, int, s, 32, 2, float, f, 20, expected_vcvt_n);
- TEST_VCVT_N(, uint, u, 32, 2, float, f, 2, expected_vcvt_n);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_n_xx_f16. */
-+ TEST_VCVT_N(q, int, s, 16, 8, float, f, 7, expected_vcvt_n);
-+ TEST_VCVT_N(q, uint, u, 16, 8, float, f, 12, expected_vcvt_n);
-+#endif
- /* vcvtq_n_xx_f32. */
- TEST_VCVT_N(q, int, s, 32, 4, float, f, 13, expected_vcvt_n);
- TEST_VCVT_N(q, uint, u, 32, 4, float, f, 1, expected_vcvt_n);
-@@ -150,20 +286,49 @@ void exec_vcvt (void)
- #define TEST_MSG "VCVT/VCVTQ"
- #undef TEST_MSG2
- #define TEST_MSG2 "(check rounding)"
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 10.4f);
-+ VDUP(vector, q, float, f, 16, 8, 125.9f);
-+#endif
- VDUP(vector, , float, f, 32, 2, 10.4f);
- VDUP(vector, q, float, f, 32, 4, 125.9f);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_xx_f16. */
-+ TEST_VCVT(, int, s, 16, 4, float, f, expected_rounding);
-+ TEST_VCVT(, uint, u, 16, 4, float, f, expected_rounding);
-+#endif
- /* vcvt_xx_f32. */
- TEST_VCVT(, int, s, 32, 2, float, f, expected_rounding);
- TEST_VCVT(, uint, u, 32, 2, float, f, expected_rounding);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_xx_f16. */
-+ TEST_VCVT(q, int, s, 16, 8, float, f, expected_rounding);
-+ TEST_VCVT(q, uint, u, 16, 8, float, f, expected_rounding);
-+#endif
- /* vcvtq_xx_f32. */
- TEST_VCVT(q, int, s, 32, 4, float, f, expected_rounding);
- TEST_VCVT(q, uint, u, 32, 4, float, f, expected_rounding);
-
- #undef TEST_MSG
- #define TEST_MSG "VCVT_N/VCVTQ_N"
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_n_xx_f16. */
-+ TEST_VCVT_N(, int, s, 16, 4, float, f, 7, expected_vcvt_n_rounding);
-+ TEST_VCVT_N(, uint, u, 16, 4, float, f, 7, expected_vcvt_n_rounding);
-+#endif
- /* vcvt_n_xx_f32. */
- TEST_VCVT_N(, int, s, 32, 2, float, f, 20, expected_vcvt_n_rounding);
- TEST_VCVT_N(, uint, u, 32, 2, float, f, 20, expected_vcvt_n_rounding);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_n_xx_f16. */
-+ TEST_VCVT_N(q, int, s, 16, 8, float, f, 13, expected_vcvt_n_rounding);
-+ TEST_VCVT_N(q, uint, u, 16, 8, float, f, 13, expected_vcvt_n_rounding);
-+#endif
- /* vcvtq_n_xx_f32. */
- TEST_VCVT_N(q, int, s, 32, 4, float, f, 13, expected_vcvt_n_rounding);
- TEST_VCVT_N(q, uint, u, 32, 4, float, f, 13, expected_vcvt_n_rounding);
-@@ -172,8 +337,18 @@ void exec_vcvt (void)
- #define TEST_MSG "VCVT_N/VCVTQ_N"
- #undef TEST_MSG2
- #define TEST_MSG2 "(check saturation)"
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt_n_xx_f16. */
-+ TEST_VCVT_N(, int, s, 16, 4, float, f, 7, expected_vcvt_n_saturation);
-+#endif
- /* vcvt_n_xx_f32. */
- TEST_VCVT_N(, int, s, 32, 2, float, f, 31, expected_vcvt_n_saturation);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvtq_n_xx_f16. */
-+ TEST_VCVT_N(q, int, s, 16, 8, float, f, 13, expected_vcvt_n_saturation);
-+#endif
- /* vcvtq_n_xx_f32. */
- TEST_VCVT_N(q, int, s, 32, 4, float, f, 31, expected_vcvt_n_saturation);
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtX.inc
-@@ -0,0 +1,113 @@
-+/* Template file for VCVT operator validation.
-+
-+ This file is meant to be included by the relevant test files, which
-+ have to define the intrinsic family to test. If a given intrinsic
-+ supports variants which are not supported by all the other vcvt
-+ operators, these can be tested by providing a definition for
-+ EXTRA_TESTS.
-+
-+ This file is only used for VCVT? tests, which currently have only f16 to
-+ integer variants. It is based on vcvt.c. */
-+
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1 (NAME)
-+
-+void FNNAME (INSN_NAME) (void)
-+{
-+ int i;
-+
-+ /* Basic test: y=vcvt(x), then store the result. */
-+#define TEST_VCVT1(INSN, Q, T1, T2, W, N, TS1, TS2, EXP) \
-+ VECT_VAR(vector_res, T1, W, N) = \
-+ INSN##Q##_##T2##W##_##TS2##W(VECT_VAR(vector, TS1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vector_res, T1, W, N)); \
-+ CHECK(TEST_MSG, T1, W, N, PRIx##W, EXP, TEST_MSG2);
-+
-+#define TEST_VCVT(INSN, Q, T1, T2, W, N, TS1, TS2, EXP) \
-+ TEST_VCVT1 (INSN, Q, T1, T2, W, N, TS1, TS2, EXP)
-+
-+ DECL_VARIABLE_ALL_VARIANTS(vector);
-+ DECL_VARIABLE_ALL_VARIANTS(vector_res);
-+
-+ clean_results ();
-+
-+ /* Initialize input "vector" from "buffer". */
-+ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-+
-+ /* Make sure some elements have a fractional part, to exercise
-+ integer conversions. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VSET_LANE(vector, , float, f, 16, 4, 0, -15.3f);
-+ VSET_LANE(vector, , float, f, 16, 4, 1, 5.3f);
-+ VSET_LANE(vector, , float, f, 16, 4, 2, -15.3f);
-+ VSET_LANE(vector, , float, f, 16, 4, 3, 5.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 4, -15.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 5, 5.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 6, -15.3f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 7, 5.3f);
-+#endif
-+
-+ /* The same result buffers are used multiple times, so we check them
-+ before overwriting them. */
-+#define TEST_MSG2 ""
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt?_xx_f16. */
-+ TEST_VCVT(INSN_NAME, , int, s, 16, 4, float, f, expected);
-+ TEST_VCVT(INSN_NAME, , uint, u, 16, 4, float, f, expected);
-+#endif
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VSET_LANE(vector, q, float, f, 16, 8, 0, 0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 1, -0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 2, 15.12f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 3, -15.12f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 4, 0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 5, -0.0f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 6, 15.12f);
-+ VSET_LANE(vector, q, float, f, 16, 8, 7, -15.12f);
-+#endif
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt?q_xx_f16. */
-+ TEST_VCVT(INSN_NAME, q, int, s, 16, 8, float, f, expected);
-+ TEST_VCVT(INSN_NAME, q, uint, u, 16, 8, float, f, expected);
-+#endif
-+
-+ /* Check rounding. */
-+#undef TEST_MSG2
-+#define TEST_MSG2 "(check rounding)"
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 10.4f);
-+ VDUP(vector, q, float, f, 16, 8, 125.9f);
-+#endif
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt?_xx_f16. */
-+ TEST_VCVT(INSN_NAME, , int, s, 16, 4, float, f, expected_rounding);
-+ TEST_VCVT(INSN_NAME, , uint, u, 16, 4, float, f, expected_rounding);
-+#endif
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ /* vcvt?q_xx_f16. */
-+ TEST_VCVT(INSN_NAME, q, int, s, 16, 8, float, f, expected_rounding);
-+ TEST_VCVT(INSN_NAME, q, uint, u, 16, 8, float, f, expected_rounding);
-+#endif
-+
-+#ifdef EXTRA_TESTS
-+ EXTRA_TESTS();
-+#endif
-+}
-+
-+int
-+main (void)
-+{
-+ FNNAME (INSN_NAME) ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvta_1.c
-@@ -0,0 +1,33 @@
-+/* This file tests an intrinsic which currently has only an f16 variant and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+#include <math.h>
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff1, 0x5, 0xfff1, 0x5 };
-+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x5, 0x0, 0x5 };
-+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0xf, 0xfff1,
-+ 0x0, 0x0, 0xf, 0xfff1 };
-+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0xf, 0x0,
-+ 0x0, 0x0, 0xf, 0x0 };
-+#endif
-+
-+/* Expected results with rounding. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
-+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e,
-+ 0x7e, 0x7e, 0x7e, 0x7e };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e,
-+ 0x7e, 0x7e, 0x7e, 0x7e };
-+#endif
-+
-+#define TEST_MSG "VCVTA/VCVTAQ"
-+#define INSN_NAME vcvta
-+
-+#include "vcvtX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_s16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int16_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 };
-+
-+#define TEST_MSG "VCVTAH_S16_F16"
-+#define INSN_NAME vcvtah_s16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_s32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0xfffffdc8,
-+ 0xffffffdd,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0xfffffffb,
-+ 0x0000004d,
-+ 0xffffff6f,
-+ 0xffffffc7,
-+ 0xfffffff0,
-+ 0xfffffff1,
-+ 0xfffffff2,
-+ 0xfffffff3
-+};
-+
-+#define TEST_MSG "VCVTAH_S32_F16"
-+#define INSN_NAME vcvtah_s32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_s64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int64_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 };
-+
-+#define TEST_MSG "VCVTAH_S64_F16"
-+#define INSN_NAME vcvtah_s64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_u16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint16_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTAH_u16_F16"
-+#define INSN_NAME vcvtah_u16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_u32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0x00000000,
-+ 0x0000004d,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000
-+};
-+
-+#define TEST_MSG "VCVTAH_U32_F16"
-+#define INSN_NAME vcvtah_u32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtah_u64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint64_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTAH_u64_F16"
-+#define INSN_NAME vcvtah_u64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_s16_1.c
-@@ -0,0 +1,25 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+int16_t input[] = { 123, -567, 0, 1024, -63, 169, -4, 77 };
-+uint16_t expected[] = { 0x57B0 /* 123.0. */, 0xE06E /* -567.0. */,
-+ 0x0000 /* 0.0. */, 0x6400 /* 1024. */,
-+ 0xD3E0 /* -63. */, 0x5948 /* 169. */,
-+ 0xC400 /* -4. */, 0x54D0 /* 77. */ };
-+
-+#define TEST_MSG "VCVTH_F16_S16"
-+#define INSN_NAME vcvth_f16_s16
-+
-+#define EXPECTED expected
-+
-+#define INPUT input
-+#define INPUT_TYPE int16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_s32_1.c
-@@ -0,0 +1,52 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+uint32_t input[] =
-+{
-+ 0, -0,
-+ 123, -567,
-+ -34, 1024,
-+ -63, 169,
-+ -4, 77,
-+ -144, -56,
-+ -16, -15,
-+ -14, -13,
-+};
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x57b0 /* 123.000000 */,
-+ 0xe06e /* -567.000000 */,
-+ 0xd040 /* -34.000000 */,
-+ 0x6400 /* 1024.000000 */,
-+ 0xd3e0 /* -63.000000 */,
-+ 0x5948 /* 169.000000 */,
-+ 0xc400 /* -4.000000 */,
-+ 0x54d0 /* 77.000000 */,
-+ 0xd880 /* -144.000000 */,
-+ 0xd300 /* -56.000000 */,
-+ 0xcc00 /* -16.000000 */,
-+ 0xcb80 /* -15.000000 */,
-+ 0xcb00 /* -14.000000 */,
-+ 0xca80 /* -13.000000 */
-+};
-+
-+#define TEST_MSG "VCVTH_F16_S32"
-+#define INSN_NAME vcvth_f16_s32
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE uint32_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_s64_1.c
-@@ -0,0 +1,25 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+int64_t input[] = { 123, -567, 0, 1024, -63, 169, -4, 77 };
-+uint16_t expected[] = { 0x57B0 /* 123.0. */, 0xE06E /* -567.0. */,
-+ 0x0000 /* 0.0. */, 0x6400 /* 1024. */,
-+ 0xD3E0 /* -63. */, 0x5948 /* 169. */,
-+ 0xC400 /* -4. */, 0x54D0 /* 77. */ };
-+
-+#define TEST_MSG "VCVTH_F16_S64"
-+#define INSN_NAME vcvth_f16_s64
-+
-+#define EXPECTED expected
-+
-+#define INPUT input
-+#define INPUT_TYPE int64_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_u16_1.c
-@@ -0,0 +1,25 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t input[] = { 123, 567, 0, 1024, 63, 169, 4, 77 };
-+uint16_t expected[] = { 0x57B0 /* 123.0. */, 0x606E /* 567.0. */,
-+ 0x0000 /* 0.0. */, 0x6400 /* 1024.0. */,
-+ 0x53E0 /* 63.0. */, 0x5948 /* 169.0. */,
-+ 0x4400 /* 4.0. */, 0x54D0 /* 77.0. */ };
-+
-+#define TEST_MSG "VCVTH_F16_U16"
-+#define INSN_NAME vcvth_f16_u16
-+
-+#define EXPECTED expected
-+
-+#define INPUT input
-+#define INPUT_TYPE uint16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_u32_1.c
-@@ -0,0 +1,52 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+int32_t input[] =
-+{
-+ 0, -0,
-+ 123, -567,
-+ -34, 1024,
-+ -63, 169,
-+ -4, 77,
-+ -144, -56,
-+ -16, -15,
-+ -14, -13,
-+};
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x57b0 /* 123.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x6400 /* 1024.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x5948 /* 169.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x54d0 /* 77.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */
-+};
-+
-+#define TEST_MSG "VCVTH_F16_U32"
-+#define INSN_NAME vcvth_f16_u32
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE int32_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_f16_u64_1.c
-@@ -0,0 +1,25 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+uint64_t input[] = { 123, 567, 0, 1024, 63, 169, 4, 77 };
-+uint16_t expected[] = { 0x57B0 /* 123.0. */, 0x606E /* 567.0. */,
-+ 0x0000 /* 0.0. */, 0x6400 /* 1024.0. */,
-+ 0x53E0 /* 63.0. */, 0x5948 /* 169.0. */,
-+ 0x4400 /* 4.0. */, 0x54D0 /* 77.0. */ };
-+
-+#define TEST_MSG "VCVTH_F16_U64"
-+#define INSN_NAME vcvth_f16_u64
-+
-+#define EXPECTED expected
-+
-+#define INPUT input
-+#define INPUT_TYPE uint64_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_s16_1.c
-@@ -0,0 +1,46 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+int16_t input[] = { 1, 10, 48, 100, -1, -10, 7, -7 };
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected_1[] = { 0x3800 /* 0.5. */,
-+ 0x4500 /* 5. */,
-+ 0x4E00 /* 24. */,
-+ 0x5240 /* 50. */,
-+ 0xB800 /* -0.5. */,
-+ 0xC500 /* -5. */,
-+ 0x4300 /* 3.5. */,
-+ 0xC300 /* -3.5. */ };
-+
-+uint16_t expected_2[] = { 0x3400 /* 0.25. */,
-+ 0x4100 /* 2.5. */,
-+ 0x4A00 /* 12. */,
-+ 0x4E40 /* 25. */,
-+ 0xB400 /* -0.25. */,
-+ 0xC100 /* -2.5. */,
-+ 0x3F00 /* 1.75. */,
-+ 0xBF00 /* -1.75. */ };
-+
-+#define TEST_MSG "VCVTH_N_F16_S16"
-+#define INSN_NAME vcvth_n_f16_s16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE int16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_s32_1.c
-@@ -0,0 +1,99 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+uint32_t input[] =
-+{
-+ 0, -0,
-+ 123, -567,
-+ -34, 1024,
-+ -63, 169,
-+ -4, 77,
-+ -144, -56,
-+ -16, -15,
-+ -14, -13,
-+};
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected_1[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x53b0 /* 61.500000 */,
-+ 0xdc6e /* -283.500000 */,
-+ 0xcc40 /* -17.000000 */,
-+ 0x6000 /* 512.000000 */,
-+ 0xcfe0 /* -31.500000 */,
-+ 0x5548 /* 84.500000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x50d0 /* 38.500000 */,
-+ 0xd480 /* -72.000000 */,
-+ 0xcf00 /* -28.000000 */,
-+ 0xc800 /* -8.000000 */,
-+ 0xc780 /* -7.500000 */,
-+ 0xc700 /* -7.000000 */,
-+ 0xc680 /* -6.500000 */
-+};
-+
-+uint16_t expected_2[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x4fb0 /* 30.750000 */,
-+ 0xd86e /* -141.750000 */,
-+ 0xc840 /* -8.500000 */,
-+ 0x5c00 /* 256.000000 */,
-+ 0xcbe0 /* -15.750000 */,
-+ 0x5148 /* 42.250000 */,
-+ 0xbc00 /* -1.000000 */,
-+ 0x4cd0 /* 19.250000 */,
-+ 0xd080 /* -36.000000 */,
-+ 0xcb00 /* -14.000000 */,
-+ 0xc400 /* -4.000000 */,
-+ 0xc380 /* -3.750000 */,
-+ 0xc300 /* -3.500000 */,
-+ 0xc280 /* -3.250000 */
-+};
-+
-+uint16_t expected_3[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x8002 /* -0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x0004 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x0001 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x8001 /* -0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x8000 /* -0.000000 */
-+};
-+
-+#define TEST_MSG "VCVTH_N_F16_S32"
-+#define INSN_NAME vcvth_n_f16_s32
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+#define EXPECTED_3 expected_3
-+
-+#define INPUT_TYPE int32_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+#define SCALAR_3 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_s64_1.c
-@@ -0,0 +1,46 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+int64_t input[] = { 1, 10, 48, 100, -1, -10, 7, -7 };
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected_1[] = { 0x3800 /* 0.5. */,
-+ 0x4500 /* 5. */,
-+ 0x4E00 /* 24. */,
-+ 0x5240 /* 50. */,
-+ 0xB800 /* -0.5. */,
-+ 0xC500 /* -5. */,
-+ 0x4300 /* 3.5. */,
-+ 0xC300 /* -3.5. */ };
-+
-+uint16_t expected_2[] = { 0x3400 /* 0.25. */,
-+ 0x4100 /* 2.5. */,
-+ 0x4A00 /* 12. */,
-+ 0x4E40 /* 25. */,
-+ 0xB400 /* -0.25. */,
-+ 0xC100 /* -2.5. */,
-+ 0x3F00 /* 1.75. */,
-+ 0xBF00 /* -1.75. */ };
-+
-+#define TEST_MSG "VCVTH_N_F16_S64"
-+#define INSN_NAME vcvth_n_f16_s64
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE int64_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_u16_1.c
-@@ -0,0 +1,46 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+uint16_t input[] = { 1, 10, 48, 100, 1000, 0, 500, 9 };
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected_1[] = { 0x3800 /* 0.5. */,
-+ 0x4500 /* 5. */,
-+ 0x4E00 /* 24. */,
-+ 0x5240 /* 50. */,
-+ 0x5FD0 /* 500. */,
-+ 0x0000 /* 0.0. */,
-+ 0x5BD0 /* 250. */,
-+ 0x4480 /* 4.5. */ };
-+
-+uint16_t expected_2[] = { 0x3400 /* 0.25. */,
-+ 0x4100 /* 2.5. */,
-+ 0x4A00 /* 12. */,
-+ 0x4E40 /* 25. */,
-+ 0x5BD0 /* 250. */,
-+ 0x0000 /* 0.0. */,
-+ 0x57D0 /* 125. */,
-+ 0x4080 /* 2.25. */ };
-+
-+#define TEST_MSG "VCVTH_N_F16_U16"
-+#define INSN_NAME vcvth_n_f16_u16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE uint16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_u32_1.c
-@@ -0,0 +1,99 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+uint32_t input[] =
-+{
-+ 0, -0,
-+ 123, -567,
-+ -34, 1024,
-+ -63, 169,
-+ -4, 77,
-+ -144, -56,
-+ -16, -15,
-+ -14, -13,
-+};
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected_1[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x53b0 /* 61.500000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x6000 /* 512.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x5548 /* 84.500000 */,
-+ 0x7c00 /* inf */,
-+ 0x50d0 /* 38.500000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */
-+};
-+
-+uint16_t expected_2[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x4fb0 /* 30.750000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x5c00 /* 256.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x5148 /* 42.250000 */,
-+ 0x7c00 /* inf */,
-+ 0x4cd0 /* 19.250000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */
-+};
-+
-+uint16_t expected_3[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x0004 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x0001 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */
-+};
-+
-+#define TEST_MSG "VCVTH_N_F16_U32"
-+#define INSN_NAME vcvth_n_f16_u32
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+#define EXPECTED_3 expected_3
-+
-+#define INPUT_TYPE uint32_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+#define SCALAR_3 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_f16_u64_1.c
-@@ -0,0 +1,46 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+uint64_t input[] = { 1, 10, 48, 100, 1000, 0, 500, 9 };
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected_1[] = { 0x3800 /* 0.5. */,
-+ 0x4500 /* 5. */,
-+ 0x4E00 /* 24. */,
-+ 0x5240 /* 50. */,
-+ 0x5FD0 /* 500. */,
-+ 0x0000 /* 0.0. */,
-+ 0x5BD0 /* 250. */,
-+ 0x4480 /* 4.5. */ };
-+
-+uint16_t expected_2[] = { 0x3400 /* 0.25. */,
-+ 0x4100 /* 2.5. */,
-+ 0x4A00 /* 12. */,
-+ 0x4E40 /* 25. */,
-+ 0x5BD0 /* 250. */,
-+ 0x0000 /* 0.0. */,
-+ 0x57D0 /* 125. */,
-+ 0x4080 /* 2.25. */ };
-+
-+#define TEST_MSG "VCVTH_N_F16_U64"
-+#define INSN_NAME vcvth_n_f16_u64
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE uint64_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_s16_f16_1.c
-@@ -0,0 +1,29 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 2.5, 100, 7.1, -9.9, -5.0, 9.1, -4.8, 77 };
-+int16_t expected_1[] = { 5, 200, 14, -19, -10, 18, -9, 154 };
-+int16_t expected_2[] = { 10, 400, 28, -39, -20, 36, -19, 308 };
-+
-+#define TEST_MSG "VCVTH_N_S16_F16"
-+#define INSN_NAME vcvth_n_s16_f16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_s32_f16_1.c
-@@ -0,0 +1,100 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected_1[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x000000f6,
-+ 0xfffffb90,
-+ 0xffffffbb,
-+ 0x00000800,
-+ 0x0000052e,
-+ 0x00000152,
-+ 0xfffffff7,
-+ 0x0000009a,
-+ 0xfffffedf,
-+ 0xffffff8f,
-+ 0xffffffe0,
-+ 0xffffffe2,
-+ 0xffffffe4,
-+ 0xffffffe6,
-+};
-+
-+uint32_t expected_2[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x000001ed,
-+ 0xfffff720,
-+ 0xffffff75,
-+ 0x00001000,
-+ 0x00000a5c,
-+ 0x000002a4,
-+ 0xffffffed,
-+ 0x00000134,
-+ 0xfffffdbe,
-+ 0xffffff1d,
-+ 0xffffffc0,
-+ 0xffffffc4,
-+ 0xffffffc8,
-+ 0xffffffcc,
-+};
-+
-+uint32_t expected_3[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x7fffffff,
-+ 0x80000000,
-+ 0x80000000,
-+ 0x7fffffff,
-+ 0x7fffffff,
-+ 0x7fffffff,
-+ 0x80000000,
-+ 0x7fffffff,
-+ 0x80000000,
-+ 0x80000000,
-+ 0x80000000,
-+ 0x80000000,
-+ 0x80000000,
-+ 0x80000000,
-+};
-+
-+#define TEST_MSG "VCVTH_N_S32_F16"
-+#define INSN_NAME vcvth_n_s32_f16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+#define EXPECTED_3 expected_3
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+#define SCALAR_3 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_s64_f16_1.c
-@@ -0,0 +1,29 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 2.5, 100, 7.1, -9.9, -5.0, 9.1, -4.8, 77 };
-+int64_t expected_1[] = { 5, 200, 14, -19, -10, 18, -9, 154 };
-+int64_t expected_2[] = { 10, 400, 28, -39, -20, 36, -19, 308 };
-+
-+#define TEST_MSG "VCVTH_N_S64_F16"
-+#define INSN_NAME vcvth_n_s64_f16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_u16_f16_1.c
-@@ -0,0 +1,29 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 2.5, 100, 7.1, 9.9, 5.0, 9.1, 4.8, 77 };
-+uint16_t expected_1[] = {5, 200, 14, 19, 10, 18, 9, 154};
-+uint16_t expected_2[] = {10, 400, 28, 39, 20, 36, 19, 308};
-+
-+#define TEST_MSG "VCVTH_N_U16_F16"
-+#define INSN_NAME vcvth_n_u16_f16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_u32_f16_1.c
-@@ -0,0 +1,100 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected_1[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x000000f6,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000800,
-+ 0x0000052e,
-+ 0x00000152,
-+ 0x00000000,
-+ 0x0000009a,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+uint32_t expected_2[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x000001ed,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00001000,
-+ 0x00000a5c,
-+ 0x000002a4,
-+ 0x00000000,
-+ 0x00000134,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+uint32_t expected_3[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0xffffffff,
-+ 0x00000000,
-+ 0x00000000,
-+ 0xffffffff,
-+ 0xffffffff,
-+ 0xffffffff,
-+ 0x00000000,
-+ 0xffffffff,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+#define TEST_MSG "VCVTH_N_U32_F16"
-+#define INSN_NAME vcvth_n_u32_f16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+#define EXPECTED_3 expected_3
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+#define SCALAR_3 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_n_u64_f16_1.c
-@@ -0,0 +1,29 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 2.5, 100, 7.1, 9.9, 5.0, 9.1, 4.8, 77 };
-+uint64_t expected_1[] = { 5, 200, 14, 19, 10, 18, 9, 154 };
-+uint64_t expected_2[] = { 10, 400, 28, 39, 20, 36, 19, 308 };
-+
-+#define TEST_MSG "VCVTH_N_U64_F16"
-+#define INSN_NAME vcvth_n_u64_f16
-+
-+#define INPUT input
-+#define EXPECTED_1 expected_1
-+#define EXPECTED_2 expected_2
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+#define SCALAR_OPERANDS
-+#define SCALAR_1 1
-+#define SCALAR_2 2
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_s16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int16_t expected[] = { 123, -56, 0, 24, -63, 169, -4, 77 };
-+
-+#define TEST_MSG "VCVTH_S16_F16"
-+#define INSN_NAME vcvth_s16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_s32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0xfffffdc8,
-+ 0xffffffde,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0xfffffffc,
-+ 0x0000004d,
-+ 0xffffff70,
-+ 0xffffffc8,
-+ 0xfffffff0,
-+ 0xfffffff1,
-+ 0xfffffff2,
-+ 0xfffffff3,
-+};
-+
-+#define TEST_MSG "VCVTH_S32_F16"
-+#define INSN_NAME vcvth_s32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_s64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int64_t expected[] = { 123, -56, 0, 24, -63, 169, -4, 77 };
-+
-+#define TEST_MSG "VCVTH_S64_F16"
-+#define INSN_NAME vcvth_s64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_u16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint16_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTH_u16_F16"
-+#define INSN_NAME vcvth_u16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_u32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0x00000000,
-+ 0x0000004d,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+#define TEST_MSG "VCVTH_U32_F16"
-+#define INSN_NAME vcvth_u32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvth_u64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint64_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTH_u64_F16"
-+#define INSN_NAME vcvth_u64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtm_1.c
-@@ -0,0 +1,33 @@
-+/* This file tests an intrinsic which currently has only an f16 variant, and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+#include <math.h>
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff0, 0x5, 0xfff0, 0x5 };
-+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x5, 0x0, 0x5 };
-+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0xf, 0xfff0, 0x0,
-+ 0x0, 0xf, 0xfff0 };
-+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0xf, 0x0,
-+ 0x0, 0x0, 0xf, 0x0 };
-+#endif
-+
-+/* Expected results with rounding. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xa, 0xa, 0xa, 0xa };
-+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
-+ 0x7d, 0x7d, 0x7d, 0x7d };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7d, 0x7d, 0x7d, 0x7d,
-+ 0x7d, 0x7d, 0x7d, 0x7d };
-+#endif
-+
-+#define TEST_MSG "VCVTM/VCVTMQ"
-+#define INSN_NAME vcvtm
-+
-+#include "vcvtX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_s16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int16_t expected[] = { 123, -57, 0, 24, -64, 169, -5, 77 };
-+
-+#define TEST_MSG "VCVTMH_S16_F16"
-+#define INSN_NAME vcvtmh_s16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_s32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0xfffffdc8,
-+ 0xffffffdd,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0xfffffffb,
-+ 0x0000004d,
-+ 0xffffff6f,
-+ 0xffffffc7,
-+ 0xfffffff0,
-+ 0xfffffff1,
-+ 0xfffffff2,
-+ 0xfffffff3
-+};
-+
-+#define TEST_MSG "VCVTMH_S32_F16"
-+#define INSN_NAME vcvtmh_s32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_s64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int64_t expected[] = { 123, -57, 0, 24, -64, 169, -5, 77 };
-+
-+#define TEST_MSG "VCVTMH_S64_F16"
-+#define INSN_NAME vcvtmh_s64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_u16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint16_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTMH_u16_F16"
-+#define INSN_NAME vcvtmh_u16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_u32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0x00000000,
-+ 0x0000004d,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+#define TEST_MSG "VCVTMH_U32_F16"
-+#define INSN_NAME vcvtmh_u32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtmh_u64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint64_t expected[] = { 123, 56, 0, 24, 63, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTMH_u64_F16"
-+#define INSN_NAME vcvtmh_u64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_s16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int16_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 };
-+
-+#define TEST_MSG "VCVTNH_S16_F16"
-+#define INSN_NAME vcvtnh_s16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_s32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0xfffffdc8,
-+ 0xffffffdd,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0xfffffffb,
-+ 0x0000004d,
-+ 0xffffff70,
-+ 0xffffffc7,
-+ 0xfffffff0,
-+ 0xfffffff1,
-+ 0xfffffff2,
-+ 0xfffffff3
-+};
-+
-+#define TEST_MSG "VCVTNH_S32_F16"
-+#define INSN_NAME vcvtnh_s32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_s64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int64_t expected[] = { 124, -57, 1, 25, -64, 169, -4, 77 };
-+
-+#define TEST_MSG "VCVTNH_S64_F16"
-+#define INSN_NAME vcvtnh_s64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_u16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint16_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTNH_u16_F16"
-+#define INSN_NAME vcvtnh_u16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_u32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007b,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000a9,
-+ 0x00000000,
-+ 0x0000004d,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+#define TEST_MSG "VCVTNH_U32_F16"
-+#define INSN_NAME vcvtnh_u32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtnh_u64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint64_t expected[] = { 124, 57, 1, 25, 64, 169, 4, 77 };
-+
-+#define TEST_MSG "VCVTNH_u64_F16"
-+#define INSN_NAME vcvtnh_u64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtp_1.c
-@@ -0,0 +1,33 @@
-+/* This file tests an intrinsic which currently has only an f16 variant, and
-+   which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+#include <math.h>
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, int, 16, 4) [] = { 0xfff1, 0x6, 0xfff1, 0x6 };
-+VECT_VAR_DECL(expected, uint, 16, 4) [] = { 0x0, 0x6, 0x0, 0x6 };
-+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x0, 0x0, 0x10, 0xfff1,
-+ 0x0, 0x0, 0x10, 0xfff1 };
-+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x0, 0x0, 0x10, 0x0,
-+ 0x0, 0x0, 0x10, 0x0 };
-+#endif
-+
-+/* Expected results with rounding. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_rounding, int, 16, 4) [] = { 0xb, 0xb, 0xb, 0xb };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 4) [] = { 0xb, 0xb, 0xb, 0xb };
-+VECT_VAR_DECL(expected_rounding, int, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e,
-+ 0x7e, 0x7e, 0x7e, 0x7e };
-+VECT_VAR_DECL(expected_rounding, uint, 16, 8) [] = { 0x7e, 0x7e, 0x7e, 0x7e,
-+ 0x7e, 0x7e, 0x7e, 0x7e };
-+#endif
-+
-+#define TEST_MSG "VCVTP/VCVTPQ"
-+#define INSN_NAME vcvtp
-+
-+#include "vcvtX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_s16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int16_t expected[] = { 124, -56, 1, 25, -63, 170, -4, 77 };
-+
-+#define TEST_MSG "VCVTPH_S16_F16"
-+#define INSN_NAME vcvtph_s16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_s32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007c,
-+ 0xfffffdc8,
-+ 0xffffffde,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000aa,
-+ 0xfffffffc,
-+ 0x0000004d,
-+ 0xffffff70,
-+ 0xffffffc8,
-+ 0xfffffff0,
-+ 0xfffffff1,
-+ 0xfffffff2,
-+ 0xfffffff3
-+};
-+
-+#define TEST_MSG "VCVTPH_S32_F16"
-+#define INSN_NAME vcvtph_s32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_s64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, -56.8, 0.7, 24.6, -63.5, 169.4, -4.3, 77.0 };
-+int64_t expected[] = { 124, -56, 1, 25, -63, 170, -4, 77 };
-+
-+#define TEST_MSG "VCVTPH_S64_F16"
-+#define INSN_NAME vcvtph_s64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE int64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_u16_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint16_t expected[] = { 124, 57, 1, 25, 64, 170, 5, 77 };
-+
-+#define TEST_MSG "VCVTPH_u16_F16"
-+#define INSN_NAME vcvtph_u16_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_u32_f16_1.c
-@@ -0,0 +1,53 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] =
-+{
-+ 0.0, -0.0,
-+ 123.4, -567.8,
-+ -34.8, 1024,
-+ 663.1, 169.1,
-+ -4.8, 77.0,
-+ -144.5, -56.8,
-+
-+ (float16_t) -16, (float16_t) -15,
-+ (float16_t) -14, (float16_t) -13,
-+};
-+
-+/* Expected results (32-bit hexadecimal representation). */
-+uint32_t expected[] =
-+{
-+ 0x00000000,
-+ 0x00000000,
-+ 0x0000007c,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000400,
-+ 0x00000297,
-+ 0x000000aa,
-+ 0x00000000,
-+ 0x0000004d,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+ 0x00000000,
-+};
-+
-+#define TEST_MSG "VCVTPH_U32_F16"
-+#define INSN_NAME vcvtph_u32_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint32_t
-+#define OUTPUT_TYPE_SIZE 32
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtph_u64_f16_1.c
-@@ -0,0 +1,23 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.9, 56.8, 0.7, 24.6, 63.5, 169.4, 4.3, 77.0 };
-+uint64_t expected[] = { 124, 57, 1, 25, 64, 170, 5, 77 };
-+
-+#define TEST_MSG "VCVTPH_u64_F16"
-+#define INSN_NAME vcvtph_u64_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE uint64_t
-+#define OUTPUT_TYPE_SIZE 64
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdiv_f16_1.c
-@@ -0,0 +1,86 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (-56.8)
-+#define C FP16_C (-34.8)
-+#define D FP16_C (12)
-+#define E FP16_C (63.1)
-+#define F FP16_C (19.1)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (77)
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-78)
-+#define K FP16_C (11.23)
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-8)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (-9.7)
-+
-+/* Expected results for vdiv. */
-+VECT_VAR_DECL (expected_div_static, hfloat, 16, 4) []
-+ = { 0x32CC /* A / E. */, 0xC1F3 /* B / F. */,
-+ 0x4740 /* C / G. */, 0x30FD /* D / H. */ };
-+
-+VECT_VAR_DECL (expected_div_static, hfloat, 16, 8) []
-+ = { 0x32CC /* A / E. */, 0xC1F3 /* B / F. */,
-+ 0x4740 /* C / G. */, 0x30FD /* D / H. */,
-+ 0x201D /* I / M. */, 0x48E0 /* J / N. */,
-+ 0xC91B /* K / O. */, 0xC90D /* L / P. */ };
-+
-+void exec_vdiv_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VDIV (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vdiv_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4));
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_div_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VDIVQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vdivq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8));
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_div_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vdiv_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdivh_f16_1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+#define INFF __builtin_inf ()
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0xb765 /* -0.462158 */,
-+ 0x27ef /* 0.030991 */,
-+ 0x3955 /* 0.666504 */,
-+ 0xccff /* -19.984375 */,
-+ 0xc49a /* -4.601562 */,
-+ 0xb1e3 /* -0.183960 */,
-+ 0x3cd3 /* 1.206055 */,
-+ 0x23f0 /* 0.015503 */,
-+ 0xa9ef /* -0.046356 */,
-+ 0x32f4 /* 0.217285 */,
-+ 0xb036 /* -0.131592 */,
-+ 0x4126 /* 2.574219 */,
-+ 0xcd15 /* -20.328125 */,
-+ 0x537f /* 59.968750 */,
-+ 0x7e00 /* nan */,
-+ 0x7e00 /* nan */
-+};
-+
-+#define TEST_MSG "VDIVH_F16"
-+#define INSN_NAME vdivh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
-@@ -19,6 +19,10 @@ VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 };
- VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
- 0xf0, 0xf0, 0xf0, 0xf0 };
- VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcc00,
-+ 0xcc00, 0xcc00 };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
- VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
- 0xf0, 0xf0, 0xf0, 0xf0,
-@@ -46,6 +50,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
- 0xf0, 0xf0, 0xf0, 0xf0 };
- VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0,
- 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcc00,
-+ 0xcc00, 0xcc00,
-+ 0xcc00, 0xcc00,
-+ 0xcc00, 0xcc00 };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000,
- 0xc1800000, 0xc1800000 };
-
-@@ -63,6 +73,10 @@ VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 };
- VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
- 0xf1, 0xf1, 0xf1, 0xf1 };
- VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb80, 0xcb80,
-+ 0xcb80, 0xcb80 };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
- VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
- 0xf1, 0xf1, 0xf1, 0xf1,
-@@ -90,6 +104,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
- 0xf1, 0xf1, 0xf1, 0xf1 };
- VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
- 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xcb80, 0xcb80,
-+ 0xcb80, 0xcb80,
-+ 0xcb80, 0xcb80,
-+ 0xcb80, 0xcb80 };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
- 0xc1700000, 0xc1700000 };
-
-@@ -107,6 +127,10 @@ VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 };
- VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
- 0xf2, 0xf2, 0xf2, 0xf2 };
- VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb00, 0xcb00,
-+ 0xcb00, 0xcb00 };
-+#endif
- VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 };
- VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
- 0xf2, 0xf2, 0xf2, 0xf2,
-@@ -134,6 +158,12 @@ VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
- 0xf2, 0xf2, 0xf2, 0xf2 };
- VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2,
- 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb00, 0xcb00,
-+ 0xcb00, 0xcb00,
-+ 0xcb00, 0xcb00,
-+ 0xcb00, 0xcb00 };
-+#endif
- VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000,
- 0xc1600000, 0xc1600000 };
-
-@@ -171,6 +201,9 @@ void exec_vdup_vmov (void)
- TEST_VDUP(, uint, u, 64, 1);
- TEST_VDUP(, poly, p, 8, 8);
- TEST_VDUP(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VDUP(, float, f, 16, 4);
-+#endif
- TEST_VDUP(, float, f, 32, 2);
-
- TEST_VDUP(q, int, s, 8, 16);
-@@ -183,8 +216,26 @@ void exec_vdup_vmov (void)
- TEST_VDUP(q, uint, u, 64, 2);
- TEST_VDUP(q, poly, p, 8, 16);
- TEST_VDUP(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VDUP(q, float, f, 16, 8);
-+#endif
- TEST_VDUP(q, float, f, 32, 4);
-
-+#if defined (FP16_SUPPORTED)
-+ switch (i) {
-+ case 0:
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected0, "");
-+ break;
-+ case 1:
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected1, "");
-+ break;
-+ case 2:
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
-+ break;
-+ default:
-+ abort();
-+ }
-+#else
- switch (i) {
- case 0:
- CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, "");
-@@ -198,6 +249,7 @@ void exec_vdup_vmov (void)
- default:
- abort();
- }
-+#endif
- }
-
- /* Do the same tests with vmov. Use the same expected results. */
-@@ -216,6 +268,9 @@ void exec_vdup_vmov (void)
- TEST_VMOV(, uint, u, 64, 1);
- TEST_VMOV(, poly, p, 8, 8);
- TEST_VMOV(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VMOV(, float, f, 16, 4);
-+#endif
- TEST_VMOV(, float, f, 32, 2);
-
- TEST_VMOV(q, int, s, 8, 16);
-@@ -228,8 +283,26 @@ void exec_vdup_vmov (void)
- TEST_VMOV(q, uint, u, 64, 2);
- TEST_VMOV(q, poly, p, 8, 16);
- TEST_VMOV(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VMOV(q, float, f, 16, 8);
-+#endif
- TEST_VMOV(q, float, f, 32, 4);
-
-+#if defined (FP16_SUPPORTED)
-+ switch (i) {
-+ case 0:
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected0, "");
-+ break;
-+ case 1:
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected1, "");
-+ break;
-+ case 2:
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
-+ break;
-+ default:
-+ abort();
-+ }
-+#else
- switch (i) {
- case 0:
- CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, "");
-@@ -243,6 +316,8 @@ void exec_vdup_vmov (void)
- default:
- abort();
- }
-+#endif
-+
- }
- }
-
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
-@@ -17,6 +17,10 @@ VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7,
- 0xf7, 0xf7, 0xf7, 0xf7 };
- VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 };
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xca80, 0xca80,
-+ 0xca80, 0xca80 };
-+#endif
- VECT_VAR_DECL(expected,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
- 0xf2, 0xf2, 0xf2, 0xf2,
- 0xf2, 0xf2, 0xf2, 0xf2,
-@@ -43,10 +47,16 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5,
- 0xf5, 0xf5, 0xf5, 0xf5 };
- VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
- 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xca80, 0xca80,
-+ 0xca80, 0xca80,
-+ 0xca80, 0xca80,
-+ 0xca80, 0xca80 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
- 0xc1700000, 0xc1700000 };
-
--#define TEST_MSG "VDUP_LANE/VDUP_LANEQ"
-+#define TEST_MSG "VDUP_LANE/VDUPQ_LANE"
- void exec_vdup_lane (void)
- {
- /* Basic test: vec1=vdup_lane(vec2, lane), then store the result. */
-@@ -63,6 +73,9 @@ void exec_vdup_lane (void)
- clean_results ();
-
- TEST_MACRO_64BITS_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
-
- /* Choose lane arbitrarily. */
-@@ -76,6 +89,9 @@ void exec_vdup_lane (void)
- TEST_VDUP_LANE(, uint, u, 64, 1, 1, 0);
- TEST_VDUP_LANE(, poly, p, 8, 8, 8, 7);
- TEST_VDUP_LANE(, poly, p, 16, 4, 4, 3);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VDUP_LANE(, float, f, 16, 4, 4, 3);
-+#endif
- TEST_VDUP_LANE(, float, f, 32, 2, 2, 1);
-
- TEST_VDUP_LANE(q, int, s, 8, 16, 8, 2);
-@@ -88,9 +104,133 @@ void exec_vdup_lane (void)
- TEST_VDUP_LANE(q, uint, u, 64, 2, 1, 0);
- TEST_VDUP_LANE(q, poly, p, 8, 16, 8, 5);
- TEST_VDUP_LANE(q, poly, p, 16, 8, 4, 1);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VDUP_LANE(q, float, f, 16, 8, 4, 3);
-+#endif
- TEST_VDUP_LANE(q, float, f, 32, 4, 2, 1);
-
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS (TEST_MSG, "");
-+#else
- CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
-+#endif
-+
-+#if defined (__aarch64__)
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VDUP_LANEQ/VDUPQ_LANEQ"
-+
-+ /* Expected results for vdup*_laneq tests. */
-+VECT_VAR_DECL(expected2,int,8,8) [] = { 0xfd, 0xfd, 0xfd, 0xfd,
-+ 0xfd, 0xfd, 0xfd, 0xfd };
-+VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
-+VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0xfffffff1 };
-+VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xff, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 };
-+VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0xfffffff1 };
-+VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf7, 0xf7, 0xf7, 0xf7,
-+ 0xf7, 0xf7, 0xf7, 0xf7 };
-+VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff3, 0xfff3, 0xfff3, 0xfff3 };
-+VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xca80, 0xca80,
-+ 0xca80, 0xca80 };
-+#endif
-+VECT_VAR_DECL(expected2,int,8,16) [] = { 0xfb, 0xfb, 0xfb, 0xfb,
-+ 0xfb, 0xfb, 0xfb, 0xfb,
-+ 0xfb, 0xfb, 0xfb, 0xfb,
-+ 0xfb, 0xfb, 0xfb, 0xfb };
-+VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff7, 0xfff7, 0xfff7, 0xfff7,
-+ 0xfff7, 0xfff7, 0xfff7, 0xfff7 };
-+VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff1, 0xfffffff1,
-+ 0xfffffff1, 0xfffffff1 };
-+VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5,
-+ 0xf5, 0xf5, 0xf5, 0xf5,
-+ 0xf5, 0xf5, 0xf5, 0xf5,
-+ 0xf5, 0xf5, 0xf5, 0xf5 };
-+VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
-+ 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
-+VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff0, 0xfffffff0,
-+ 0xfffffff0, 0xfffffff0 };
-+VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5,
-+ 0xf5, 0xf5, 0xf5, 0xf5,
-+ 0xf5, 0xf5, 0xf5, 0xf5,
-+ 0xf5, 0xf5, 0xf5, 0xf5 };
-+VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
-+ 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xc880, 0xc880,
-+ 0xc880, 0xc880,
-+ 0xc880, 0xc880,
-+ 0xc880, 0xc880 };
-+#endif
-+VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
-+ 0xc1700000, 0xc1700000 };
-+
-+ /* Clean all results for vdup*_laneq tests. */
-+ clean_results ();
-+  /* Basic test: vec1=vdup_laneq(vec2, lane), then store the result.  */
-+#define TEST_VDUP_LANEQ(Q, T1, T2, W, N, N2, L) \
-+ VECT_VAR(vector_res, T1, W, N) = \
-+ vdup##Q##_laneq_##T2##W(VECT_VAR(vector, T1, W, N2), L); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
-+
-+  /* Input vector can only have 128 bits.  */
-+ DECL_VARIABLE_128BITS_VARIANTS(vector);
-+
-+ clean_results ();
-+
-+ TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD(vector, buffer, q, float, f, 32, 4);
-+
-+ /* Choose lane arbitrarily. */
-+ TEST_VDUP_LANEQ(, int, s, 8, 8, 16, 13);
-+ TEST_VDUP_LANEQ(, int, s, 16, 4, 8, 2);
-+ TEST_VDUP_LANEQ(, int, s, 32, 2, 4, 1);
-+ TEST_VDUP_LANEQ(, int, s, 64, 1, 2, 0);
-+ TEST_VDUP_LANEQ(, uint, u, 8, 8, 16, 15);
-+ TEST_VDUP_LANEQ(, uint, u, 16, 4, 8, 3);
-+ TEST_VDUP_LANEQ(, uint, u, 32, 2, 4, 1);
-+ TEST_VDUP_LANEQ(, uint, u, 64, 1, 2, 0);
-+ TEST_VDUP_LANEQ(, poly, p, 8, 8, 16, 7);
-+ TEST_VDUP_LANEQ(, poly, p, 16, 4, 8, 3);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VDUP_LANEQ(, float, f, 16, 4, 8, 3);
-+#endif
-+ TEST_VDUP_LANEQ(, float, f, 32, 2, 4, 1);
-+
-+ TEST_VDUP_LANEQ(q, int, s, 8, 16, 16, 11);
-+ TEST_VDUP_LANEQ(q, int, s, 16, 8, 8, 7);
-+ TEST_VDUP_LANEQ(q, int, s, 32, 4, 4, 1);
-+ TEST_VDUP_LANEQ(q, int, s, 64, 2, 2, 0);
-+ TEST_VDUP_LANEQ(q, uint, u, 8, 16, 16, 5);
-+ TEST_VDUP_LANEQ(q, uint, u, 16, 8, 8, 1);
-+ TEST_VDUP_LANEQ(q, uint, u, 32, 4, 4, 0);
-+ TEST_VDUP_LANEQ(q, uint, u, 64, 2, 2, 0);
-+ TEST_VDUP_LANEQ(q, poly, p, 8, 16, 16, 5);
-+ TEST_VDUP_LANEQ(q, poly, p, 16, 8, 8, 1);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VDUP_LANEQ(q, float, f, 16, 8, 8, 7);
-+#endif
-+ TEST_VDUP_LANEQ(q, float, f, 32, 4, 4, 1);
-+
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
-+#if defined (FP16_SUPPORTED)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, "");
-+#endif
-+
-+#endif /* __aarch64__. */
- }
-
- int main (void)
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vduph_lane.c
-@@ -0,0 +1,137 @@
-+/* { dg-do run } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define A -16
-+#define B -15
-+#define C -14
-+#define D -13
-+#define E -12
-+#define F -11
-+#define G -10
-+#define H -9
-+
-+#define F16_C(a) ((__fp16) a)
-+#define AF F16_C (A)
-+#define BF F16_C (B)
-+#define CF F16_C (C)
-+#define DF F16_C (D)
-+#define EF F16_C (E)
-+#define FF F16_C (F)
-+#define GF F16_C (G)
-+#define HF F16_C (H)
-+
-+#define S16_C(a) ((int16_t) a)
-+#define AS S16_C (A)
-+#define BS S16_C (B)
-+#define CS S16_C (C)
-+#define DS S16_C (D)
-+#define ES S16_C (E)
-+#define FS S16_C (F)
-+#define GS S16_C (G)
-+#define HS S16_C (H)
-+
-+#define U16_C(a) ((uint16_t) a)
-+#define AU U16_C (A)
-+#define BU U16_C (B)
-+#define CU U16_C (C)
-+#define DU U16_C (D)
-+#define EU U16_C (E)
-+#define FU U16_C (F)
-+#define GU U16_C (G)
-+#define HU U16_C (H)
-+
-+#define P16_C(a) ((poly16_t) a)
-+#define AP P16_C (A)
-+#define BP P16_C (B)
-+#define CP P16_C (C)
-+#define DP P16_C (D)
-+#define EP P16_C (E)
-+#define FP P16_C (F)
-+#define GP P16_C (G)
-+#define HP P16_C (H)
-+
-+/* Expected results for vduph_lane. */
-+float16_t expected_f16 = AF;
-+int16_t expected_s16 = DS;
-+uint16_t expected_u16 = BU;
-+poly16_t expected_p16 = CP;
-+
-+/* Expected results for vduph_laneq. */
-+float16_t expected_q_f16 = EF;
-+int16_t expected_q_s16 = BS;
-+uint16_t expected_q_u16 = GU;
-+poly16_t expected_q_p16 = FP;
-+
-+void exec_vduph_lane_f16 (void)
-+{
-+ /* vduph_lane. */
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ DECL_VARIABLE(vsrc, int, 16, 4);
-+ DECL_VARIABLE(vsrc, uint, 16, 4);
-+ DECL_VARIABLE(vsrc, poly, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {AF, BF, CF, DF};
-+ VECT_VAR_DECL (buf_src, int, 16, 4) [] = {AS, BS, CS, DS};
-+ VECT_VAR_DECL (buf_src, uint, 16, 4) [] = {AU, BU, CU, DU};
-+ VECT_VAR_DECL (buf_src, poly, 16, 4) [] = {AP, BP, CP, DP};
-+ VLOAD (vsrc, buf_src, , int, s, 16, 4);
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ VLOAD (vsrc, buf_src, , uint, u, 16, 4);
-+ VLOAD (vsrc, buf_src, , poly, p, 16, 4);
-+
-+ float16_t res_f = vduph_lane_f16 (VECT_VAR (vsrc, float, 16, 4), 0);
-+ if (* (unsigned short *) &res_f != * (unsigned short *) &expected_f16)
-+ abort ();
-+
-+ int16_t res_s = vduph_lane_s16 (VECT_VAR (vsrc, int, 16, 4), 3);
-+ if (* (unsigned short *) &res_s != * (unsigned short *) &expected_s16)
-+ abort ();
-+
-+ uint16_t res_u = vduph_lane_u16 (VECT_VAR (vsrc, uint, 16, 4), 1);
-+ if (* (unsigned short *) &res_u != * (unsigned short *) &expected_u16)
-+ abort ();
-+
-+ poly16_t res_p = vduph_lane_p16 (VECT_VAR (vsrc, poly, 16, 4), 2);
-+ if (* (unsigned short *) &res_p != * (unsigned short *) &expected_p16)
-+ abort ();
-+
-+ /* vduph_laneq. */
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ DECL_VARIABLE(vsrc, int, 16, 8);
-+ DECL_VARIABLE(vsrc, uint, 16, 8);
-+ DECL_VARIABLE(vsrc, poly, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {AF, BF, CF, DF, EF, FF, GF, HF};
-+ VECT_VAR_DECL (buf_src, int, 16, 8) [] = {AS, BS, CS, DS, ES, FS, GS, HS};
-+ VECT_VAR_DECL (buf_src, uint, 16, 8) [] = {AU, BU, CU, DU, EU, FU, GU, HU};
-+ VECT_VAR_DECL (buf_src, poly, 16, 8) [] = {AP, BP, CP, DP, EP, FP, GP, HP};
-+ VLOAD (vsrc, buf_src, q, int, s, 16, 8);
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ VLOAD (vsrc, buf_src, q, uint, u, 16, 8);
-+ VLOAD (vsrc, buf_src, q, poly, p, 16, 8);
-+
-+ res_f = vduph_laneq_f16 (VECT_VAR (vsrc, float, 16, 8), 4);
-+ if (* (unsigned short *) &res_f != * (unsigned short *) &expected_q_f16)
-+ abort ();
-+
-+ res_s = vduph_laneq_s16 (VECT_VAR (vsrc, int, 16, 8), 1);
-+ if (* (unsigned short *) &res_s != * (unsigned short *) &expected_q_s16)
-+ abort ();
-+
-+ res_u = vduph_laneq_u16 (VECT_VAR (vsrc, uint, 16, 8), 6);
-+ if (* (unsigned short *) &res_u != * (unsigned short *) &expected_q_u16)
-+ abort ();
-+
-+ res_p = vduph_laneq_p16 (VECT_VAR (vsrc, poly, 16, 8), 5);
-+ if (* (unsigned short *) &res_p != * (unsigned short *) &expected_q_p16)
-+ abort ();
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vduph_lane_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vext.c
-@@ -16,6 +16,10 @@ VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
- VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0xf7, 0x55, 0x55,
- 0x55, 0x55, 0x55, 0x55 };
- VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcb00, 0xca80,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0x42066666 };
- VECT_VAR_DECL(expected,int,8,16) [] = { 0xfe, 0xff, 0x11, 0x11,
- 0x11, 0x11, 0x11, 0x11,
-@@ -39,6 +43,12 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfc, 0xfd, 0xfe, 0xff,
- 0x55, 0x55, 0x55, 0x55 };
- VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff6, 0xfff7, 0x66, 0x66,
- 0x66, 0x66, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xc880, 0x4b4d,
-+ 0x4b4d, 0x4b4d,
-+ 0x4b4d, 0x4b4d,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1500000, 0x4204cccd,
- 0x4204cccd, 0x4204cccd };
-
-@@ -60,6 +70,10 @@ void exec_vext (void)
- clean_results ();
-
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector1, buffer);
-+#ifdef FP16_SUPPORTED
-+ VLOAD(vector1, buffer, , float, f, 16, 4);
-+ VLOAD(vector1, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector1, buffer, , float, f, 32, 2);
- VLOAD(vector1, buffer, q, float, f, 32, 4);
-
-@@ -74,6 +88,9 @@ void exec_vext (void)
- VDUP(vector2, , uint, u, 64, 1, 0x88);
- VDUP(vector2, , poly, p, 8, 8, 0x55);
- VDUP(vector2, , poly, p, 16, 4, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */
-+#endif
- VDUP(vector2, , float, f, 32, 2, 33.6f);
-
- VDUP(vector2, q, int, s, 8, 16, 0x11);
-@@ -86,6 +103,9 @@ void exec_vext (void)
- VDUP(vector2, q, uint, u, 64, 2, 0x88);
- VDUP(vector2, q, poly, p, 8, 16, 0x55);
- VDUP(vector2, q, poly, p, 16, 8, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, q, float, f, 16, 8, 14.6f);
-+#endif
- VDUP(vector2, q, float, f, 32, 4, 33.2f);
-
- /* Choose arbitrary extract offsets. */
-@@ -99,6 +119,9 @@ void exec_vext (void)
- TEST_VEXT(, uint, u, 64, 1, 0);
- TEST_VEXT(, poly, p, 8, 8, 6);
- TEST_VEXT(, poly, p, 16, 4, 2);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VEXT(, float, f, 16, 4, 2);
-+#endif
- TEST_VEXT(, float, f, 32, 2, 1);
-
- TEST_VEXT(q, int, s, 8, 16, 14);
-@@ -111,9 +134,16 @@ void exec_vext (void)
- TEST_VEXT(q, uint, u, 64, 2, 1);
- TEST_VEXT(q, poly, p, 8, 16, 12);
- TEST_VEXT(q, poly, p, 16, 8, 6);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VEXT(q, float, f, 16, 8, 7);
-+#endif
- TEST_VEXT(q, float, f, 32, 4, 3);
-
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS (TEST_MSG, "");
-+#else
- CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
-+#endif
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c
-@@ -3,11 +3,19 @@
- #include "compute-ref-data.h"
-
- #ifdef __ARM_FEATURE_FMA
-+
- /* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x61c6, 0x61c8, 0x61ca, 0x61cc };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x6435, 0x6436, 0x6437, 0x6438,
-+ 0x6439, 0x643a, 0x643b, 0x643c };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
--VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 };
-+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8,
-+ 0x4486deb8, 0x4486feb8 };
- #ifdef __aarch64__
--VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 };
-+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520,
-+ 0x40890ee1532b8520 };
- #endif
-
- #define TEST_MSG "VFMA/VFMAQ"
-@@ -44,6 +52,18 @@ void exec_vfma (void)
- DECL_VARIABLE(VAR, float, 32, 4);
- #endif
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector1, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector3, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+
-+ DECL_VARIABLE(vector1, float, 16, 8);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+ DECL_VARIABLE(vector3, float, 16, 8);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+
- DECL_VFMA_VAR(vector1);
- DECL_VFMA_VAR(vector2);
- DECL_VFMA_VAR(vector3);
-@@ -52,6 +72,10 @@ void exec_vfma (void)
- clean_results ();
-
- /* Initialize input "vector1" from "buffer". */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector1, buffer, , float, f, 16, 4);
-+ VLOAD(vector1, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector1, buffer, , float, f, 32, 2);
- VLOAD(vector1, buffer, q, float, f, 32, 4);
- #ifdef __aarch64__
-@@ -59,13 +83,21 @@ void exec_vfma (void)
- #endif
-
- /* Choose init value arbitrarily. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 9.3f);
-+ VDUP(vector2, q, float, f, 16, 8, 29.7f);
-+#endif
- VDUP(vector2, , float, f, 32, 2, 9.3f);
- VDUP(vector2, q, float, f, 32, 4, 29.7f);
- #ifdef __aarch64__
- VDUP(vector2, q, float, f, 64, 2, 15.8f);
- #endif
--
-+
- /* Choose init value arbitrarily. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector3, , float, f, 16, 4, 81.2f);
-+ VDUP(vector3, q, float, f, 16, 8, 36.8f);
-+#endif
- VDUP(vector3, , float, f, 32, 2, 81.2f);
- VDUP(vector3, q, float, f, 32, 4, 36.8f);
- #ifdef __aarch64__
-@@ -73,12 +105,20 @@ void exec_vfma (void)
- #endif
-
- /* Execute the tests. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VFMA(, float, f, 16, 4);
-+ TEST_VFMA(q, float, f, 16, 8);
-+#endif
- TEST_VFMA(, float, f, 32, 2);
- TEST_VFMA(q, float, f, 32, 4);
- #ifdef __aarch64__
- TEST_VFMA(q, float, f, 64, 2);
- #endif
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
- CHECK_VFMA_RESULTS (TEST_MSG, "");
- }
- #endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmah_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3944 /* 0.658203 */,
-+ 0xcefa /* -27.906250 */,
-+ 0x5369 /* 59.281250 */,
-+ 0x35ba /* 0.357910 */,
-+ 0xc574 /* -5.453125 */,
-+ 0xc5e6 /* -5.898438 */,
-+ 0x3f66 /* 1.849609 */,
-+ 0x5665 /* 102.312500 */,
-+ 0xc02d /* -2.087891 */,
-+ 0x4d79 /* 21.890625 */,
-+ 0x547b /* 71.687500 */,
-+ 0xcdf0 /* -23.750000 */,
-+ 0xc625 /* -6.144531 */,
-+ 0x4cf9 /* 19.890625 */,
-+ 0x7e00 /* nan */,
-+ 0x7e00 /* nan */
-+};
-+
-+#define TEST_MSG "VFMAH_F16"
-+#define INSN_NAME vfmah_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for ternary scalar operations.  */
-+#include "ternary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmas_lane_f16_1.c
-@@ -0,0 +1,908 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (123.4)
-+#define A1 FP16_C (-5.8)
-+#define A2 FP16_C (-0.0)
-+#define A3 FP16_C (10)
-+#define A4 FP16_C (123412.43)
-+#define A5 FP16_C (-5.8)
-+#define A6 FP16_C (90.8)
-+#define A7 FP16_C (24)
-+
-+#define B0 FP16_C (23.4)
-+#define B1 FP16_C (-5.8)
-+#define B2 FP16_C (8.9)
-+#define B3 FP16_C (4.0)
-+#define B4 FP16_C (3.4)
-+#define B5 FP16_C (-550.8)
-+#define B6 FP16_C (-31.8)
-+#define B7 FP16_C (20000.0)
-+
-+/* Expected results for vfma_lane. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 4) []
-+ = { 0x613E /* A0 + B0 * B0. */,
-+ 0xD86D /* A1 + B1 * B0. */,
-+ 0x5A82 /* A2 + B2 * B0. */,
-+ 0x567A /* A3 + B3 * B0. */};
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 4) []
-+ = { 0xCA33 /* A0 + B0 * B1. */,
-+ 0x4EF6 /* A1 + B1 * B1. */,
-+ 0xD274 /* A2 + B2 * B1. */,
-+ 0xCA9A /* A3 + B3 * B1. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 4) []
-+ = { 0x5D2F /* A0 + B0 * B2. */,
-+ 0xD32D /* A1 + B1 * B2. */,
-+ 0x54F3 /* A2 + B2 * B2. */,
-+ 0x51B3 /* A3 + B3 * B2. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 4) []
-+ = { 0x5AC8 /* A0 + B0 * B3. */,
-+ 0xCF40 /* A1 + B1 * B3. */,
-+ 0x5073 /* A2 + B2 * B3. */,
-+ 0x4E80 /* A3 + B3 * B3. */ };
-+
-+/* Expected results for vfmaq_lane. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 8) []
-+ = { 0x613E /* A0 + B0 * B0. */,
-+ 0xD86D /* A1 + B1 * B0. */,
-+ 0x5A82 /* A2 + B2 * B0. */,
-+ 0x567A /* A3 + B3 * B0. */,
-+ 0x7C00 /* A4 + B4 * B0. */,
-+ 0xF24D /* A5 + B5 * B0. */,
-+ 0xE11B /* A6 + B6 * B0. */,
-+ 0x7C00 /* A7 + B7 * B0. */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 8) []
-+ = { 0xCA33 /* A0 + B0 * B1. */,
-+ 0x4EF6 /* A1 + B1 * B1. */,
-+ 0xD274 /* A2 + B2 * B1. */,
-+ 0xCA9A /* A3 + B3 * B1. */,
-+ 0x7C00 /* A4 + B4 * B1. */,
-+ 0x6A3B /* A5 + B5 * B1. */,
-+ 0x5C4D /* A6 + B6 * B1. */,
-+ 0xFC00 /* A7 + B7 * B1. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 8) []
-+ = { 0x5D2F /* A0 + B0 * B2. */,
-+ 0xD32D /* A1 + B1 * B2. */,
-+ 0x54F3 /* A2 + B2 * B2. */,
-+ 0x51B3 /* A3 + B3 * B2. */,
-+ 0x7C00 /* A4 + B4 * B2. */,
-+ 0xECCB /* A5 + B5 * B2. */,
-+ 0xDA01 /* A6 + B6 * B2. */,
-+ 0x7C00 /* A7 + B7 * B2. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 8) []
-+ = { 0x5AC8 /* A0 + B0 * B3. */,
-+ 0xCF40 /* A1 + B1 * B3. */,
-+ 0x5073 /* A2 + B2 * B3. */,
-+ 0x4E80 /* A3 + B3 * B3. */,
-+ 0x7C00 /* A4 + B4 * B3. */,
-+ 0xE851 /* A5 + B5 * B3. */,
-+ 0xD08C /* A6 + B6 * B3. */,
-+ 0x7C00 /* A7 + B7 * B3. */ };
-+
-+/* Expected results for vfma_laneq. */
-+VECT_VAR_DECL (expected0_laneq_static, hfloat, 16, 4) []
-+ = { 0x613E /* A0 + B0 * B0. */,
-+ 0xD86D /* A1 + B1 * B0. */,
-+ 0x5A82 /* A2 + B2 * B0. */,
-+ 0x567A /* A3 + B3 * B0. */ };
-+
-+VECT_VAR_DECL (expected1_laneq_static, hfloat, 16, 4) []
-+ = { 0xCA33 /* A0 + B0 * B1. */,
-+ 0x4EF6 /* A1 + B1 * B1. */,
-+ 0xD274 /* A2 + B2 * B1. */,
-+ 0xCA9A /* A3 + B3 * B1. */ };
-+
-+VECT_VAR_DECL (expected2_laneq_static, hfloat, 16, 4) []
-+ = { 0x5D2F /* A0 + B0 * B2. */,
-+ 0xD32D /* A1 + B1 * B2. */,
-+ 0x54F3 /* A2 + B2 * B2. */,
-+ 0x51B3 /* A3 + B3 * B2. */ };
-+
-+VECT_VAR_DECL (expected3_laneq_static, hfloat, 16, 4) []
-+ = { 0x5AC8 /* A0 + B0 * B3. */,
-+ 0xCF40 /* A1 + B1 * B3. */,
-+ 0x5073 /* A2 + B2 * B3. */,
-+ 0x4E80 /* A3 + B3 * B3. */ };
-+
-+VECT_VAR_DECL (expected4_laneq_static, hfloat, 16, 4) []
-+ = { 0x5A58 /* A0 + B0 * B4. */,
-+ 0xCE62 /* A1 + B1 * B4. */,
-+ 0x4F91 /* A2 + B2 * B4. */,
-+ 0x4DE6 /* A3 + B3 * B4. */ };
-+
-+VECT_VAR_DECL (expected5_laneq_static, hfloat, 16, 4) []
-+ = { 0xF23D /* A0 + B0 * B5. */,
-+ 0x6A3B /* A1 + B1 * B5. */,
-+ 0xECCA /* A2 + B2 * B5. */,
-+ 0xE849 /* A3 + B3 * B5. */ };
-+
-+VECT_VAR_DECL (expected6_laneq_static, hfloat, 16, 4) []
-+ = { 0xE0DA /* A0 + B0 * B6. */,
-+ 0x5995 /* A1 + B1 * B6. */,
-+ 0xDC6C /* A2 + B2 * B6. */,
-+ 0xD753 /* A3 + B3 * B6. */ };
-+
-+VECT_VAR_DECL (expected7_laneq_static, hfloat, 16, 4) []
-+ = { 0x7C00 /* A0 + B0 * B7. */,
-+ 0xFC00 /* A1 + B1 * B7. */,
-+ 0x7C00 /* A2 + B2 * B7. */,
-+ 0x7C00 /* A3 + B3 * B7. */ };
-+
-+/* Expected results for vfmaq_laneq. */
-+VECT_VAR_DECL (expected0_laneq_static, hfloat, 16, 8) []
-+ = { 0x613E /* A0 + B0 * B0. */,
-+ 0xD86D /* A1 + B1 * B0. */,
-+ 0x5A82 /* A2 + B2 * B0. */,
-+ 0x567A /* A3 + B3 * B0. */,
-+ 0x7C00 /* A4 + B4 * B0. */,
-+ 0xF24D /* A5 + B5 * B0. */,
-+ 0xE11B /* A6 + B6 * B0. */,
-+ 0x7C00 /* A7 + B7 * B0. */ };
-+
-+VECT_VAR_DECL (expected1_laneq_static, hfloat, 16, 8) []
-+ = { 0xCA33 /* A0 + B0 * B1. */,
-+ 0x4EF6 /* A1 + B1 * B1. */,
-+ 0xD274 /* A2 + B2 * B1. */,
-+ 0xCA9A /* A3 + B3 * B1. */,
-+ 0x7C00 /* A4 + B4 * B1. */,
-+ 0x6A3B /* A5 + B5 * B1. */,
-+ 0x5C4D /* A6 + B6 * B1. */,
-+ 0xFC00 /* A7 + B7 * B1. */ };
-+
-+VECT_VAR_DECL (expected2_laneq_static, hfloat, 16, 8) []
-+ = { 0x5D2F /* A0 + B0 * B2. */,
-+ 0xD32D /* A1 + B1 * B2. */,
-+ 0x54F3 /* A2 + B2 * B2. */,
-+ 0x51B3 /* A3 + B3 * B2. */,
-+ 0x7C00 /* A4 + B4 * B2. */,
-+ 0xECCB /* A5 + B5 * B2. */,
-+ 0xDA01 /* A6 + B6 * B2. */,
-+ 0x7C00 /* A7 + B7 * B2. */ };
-+
-+VECT_VAR_DECL (expected3_laneq_static, hfloat, 16, 8) []
-+ = { 0x5AC8 /* A0 + B0 * B3. */,
-+ 0xCF40 /* A1 + B1 * B3. */,
-+ 0x5073 /* A2 + B2 * B3. */,
-+ 0x4E80 /* A3 + B3 * B3. */,
-+ 0x7C00 /* A4 + B4 * B3. */,
-+ 0xE851 /* A5 + B5 * B3. */,
-+ 0xD08C /* A6 + B6 * B3. */,
-+ 0x7C00 /* A7 + B7 * B3. */ };
-+
-+VECT_VAR_DECL (expected4_laneq_static, hfloat, 16, 8) []
-+ = { 0x5A58 /* A0 + B0 * B4. */,
-+ 0xCE62 /* A1 + B1 * B4. */,
-+ 0x4F91 /* A2 + B2 * B4. */,
-+ 0x4DE6 /* A3 + B3 * B4. */,
-+ 0x7C00 /* A4 + B4 * B4. */,
-+ 0xE757 /* A5 + B5 * B4. */,
-+ 0xCC54 /* A6 + B6 * B4. */,
-+ 0x7C00 /* A7 + B7 * B4. */ };
-+
-+VECT_VAR_DECL (expected5_laneq_static, hfloat, 16, 8) []
-+ = { 0xF23D /* A0 + B0 * B5. */,
-+ 0x6A3B /* A1 + B1 * B5. */,
-+ 0xECCA /* A2 + B2 * B5. */,
-+ 0xE849 /* A3 + B3 * B5. */,
-+ 0x7C00 /* A4 + B4 * B5. */,
-+ 0x7C00 /* A5 + B5 * B5. */,
-+ 0x744D /* A6 + B6 * B5. */,
-+ 0xFC00 /* A7 + B7 * B5. */ };
-+
-+VECT_VAR_DECL (expected6_laneq_static, hfloat, 16, 8) []
-+ = { 0xE0DA /* A0 + B0 * B6. */,
-+ 0x5995 /* A1 + B1 * B6. */,
-+ 0xDC6C /* A2 + B2 * B6. */,
-+ 0xD753 /* A3 + B3 * B6. */,
-+ 0x7C00 /* A4 + B4 * B6. */,
-+ 0x7447 /* A5 + B5 * B6. */,
-+ 0x644E /* A6 + B6 * B6. */,
-+ 0xFC00 /* A7 + B7 * B6. */ };
-+
-+VECT_VAR_DECL (expected7_laneq_static, hfloat, 16, 8) []
-+ = { 0x7C00 /* A0 + B0 * B7. */,
-+ 0xFC00 /* A1 + B1 * B7. */,
-+ 0x7C00 /* A2 + B2 * B7. */,
-+ 0x7C00 /* A3 + B3 * B7. */,
-+ 0x7C00 /* A4 + B4 * B7. */,
-+ 0xFC00 /* A5 + B5 * B7. */,
-+ 0xFC00 /* A6 + B6 * B7. */,
-+ 0x7C00 /* A7 + B7 * B7. */ };
-+
-+/* Expected results for vfms_lane. */
-+VECT_VAR_DECL (expected0_fms_static, hfloat, 16, 4) []
-+ = { 0xDEA2 /* A0 + (-B0) * B0. */,
-+ 0x5810 /* A1 + (-B1) * B0. */,
-+ 0xDA82 /* A2 + (-B2) * B0. */,
-+ 0xD53A /* A3 + (-B3) * B0. */ };
-+
-+VECT_VAR_DECL (expected1_fms_static, hfloat, 16, 4) []
-+ = { 0x5C0D /* A0 + (-B0) * B1. */,
-+ 0xD0EE /* A1 + (-B1) * B1. */,
-+ 0x5274 /* A2 + (-B2) * B1. */,
-+ 0x5026 /* A3 + (-B3) * B1. */ };
-+
-+VECT_VAR_DECL (expected2_fms_static, hfloat, 16, 4) []
-+ = { 0xD54E /* A0 + (-B0) * B2. */,
-+ 0x51BA /* A1 + (-B1) * B2. */,
-+ 0xD4F3 /* A2 + (-B2) * B2. */,
-+ 0xCE66 /* A3 + (-B3) * B2. */ };
-+
-+VECT_VAR_DECL (expected3_fms_static, hfloat, 16, 4) []
-+ = { 0x4F70 /* A0 + (-B0) * B3. */,
-+ 0x4C5A /* A1 + (-B1) * B3. */,
-+ 0xD073 /* A2 + (-B2) * B3. */,
-+ 0xC600 /* A3 + (-B3) * B3. */ };
-+
-+/* Expected results for vfmsq_lane. */
-+VECT_VAR_DECL (expected0_fms_static, hfloat, 16, 8) []
-+ = { 0xDEA2 /* A0 + (-B0) * B0. */,
-+ 0x5810 /* A1 + (-B1) * B0. */,
-+ 0xDA82 /* A2 + (-B2) * B0. */,
-+ 0xD53A /* A3 + (-B3) * B0. */,
-+ 0x7C00 /* A4 + (-B4) * B0. */,
-+ 0x724B /* A5 + (-B5) * B0. */,
-+ 0x6286 /* A6 + (-B6) * B0. */,
-+ 0xFC00 /* A7 + (-B7) * B0. */ };
-+
-+VECT_VAR_DECL (expected1_fms_static, hfloat, 16, 8) []
-+ = { 0x5C0D /* A0 + (-B0) * B1. */,
-+ 0xD0EE /* A1 + (-B1) * B1. */,
-+ 0x5274 /* A2 + (-B2) * B1. */,
-+ 0x5026 /* A3 + (-B3) * B1. */,
-+ 0x7C00 /* A4 + (-B4) * B1. */,
-+ 0xEA41 /* A5 + (-B5) * B1. */,
-+ 0xD5DA /* A6 + (-B6) * B1. */,
-+ 0x7C00 /* A7 + (-B7) * B1. */ };
-+
-+VECT_VAR_DECL (expected2_fms_static, hfloat, 16, 8) []
-+ = { 0xD54E /* A0 + (-B0) * B2. */,
-+ 0x51BA /* A1 + (-B1) * B2. */,
-+ 0xD4F3 /* A2 + (-B2) * B2. */,
-+ 0xCE66 /* A3 + (-B3) * B2. */,
-+ 0x7C00 /* A4 + (-B4) * B2. */,
-+ 0x6CC8 /* A5 + (-B5) * B2. */,
-+ 0x5DD7 /* A6 + (-B6) * B2. */,
-+ 0xFC00 /* A7 + (-B7) * B2. */ };
-+
-+VECT_VAR_DECL (expected3_fms_static, hfloat, 16, 8) []
-+ = { 0x4F70 /* A0 + (-B0) * B3. */,
-+ 0x4C5A /* A1 + (-B1) * B3. */,
-+ 0xD073 /* A2 + (-B2) * B3. */,
-+ 0xC600 /* A3 + (-B3) * B3. */,
-+ 0x7C00 /* A4 + (-B4) * B3. */,
-+ 0x684B /* A5 + (-B5) * B3. */,
-+ 0x5AD0 /* A6 + (-B6) * B3. */,
-+ 0xFC00 /* A7 + (-B7) * B3. */ };
-+
-+/* Expected results for vfms_laneq. */
-+VECT_VAR_DECL (expected0_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0xDEA2 /* A0 + (-B0) * B0. */,
-+ 0x5810 /* A1 + (-B1) * B0. */,
-+ 0xDA82 /* A2 + (-B2) * B0. */,
-+ 0xD53A /* A3 + (-B3) * B0. */ };
-+
-+VECT_VAR_DECL (expected1_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0x5C0D /* A0 + (-B0) * B1. */,
-+ 0xD0EE /* A1 + (-B1) * B1. */,
-+ 0x5274 /* A2 + (-B2) * B1. */,
-+ 0x5026 /* A3 + (-B3) * B1. */ };
-+
-+VECT_VAR_DECL (expected2_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0xD54E /* A0 + (-B0) * B2. */,
-+ 0x51BA /* A1 + (-B1) * B2. */,
-+ 0xD4F3 /* A2 + (-B2) * B2. */,
-+ 0xCE66 /* A3 + (-B3) * B2. */ };
-+
-+VECT_VAR_DECL (expected3_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0x4F70 /* A0 + (-B0) * B3. */,
-+ 0x4C5A /* A1 + (-B1) * B3. */,
-+ 0xD073 /* A2 + (-B2) * B3. */,
-+ 0xC600 /* A3 + (-B3) * B3. */ };
-+
-+VECT_VAR_DECL (expected4_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0x5179 /* A0 + (-B0) * B4. */,
-+ 0x4AF6 /* A1 + (-B1) * B4. */,
-+ 0xCF91 /* A2 + (-B2) * B4. */,
-+ 0xC334 /* A3 + (-B3) * B4. */ };
-+
-+VECT_VAR_DECL (expected5_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0x725C /* A0 + (-B0) * B5. */,
-+ 0xEA41 /* A1 + (-B1) * B5. */,
-+ 0x6CCA /* A2 + (-B2) * B5. */,
-+ 0x6853 /* A3 + (-B3) * B5. */ };
-+
-+VECT_VAR_DECL (expected6_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0x62C7 /* A0 + (-B0) * B6. */,
-+ 0xD9F2 /* A1 + (-B1) * B6. */,
-+ 0x5C6C /* A2 + (-B2) * B6. */,
-+ 0x584A /* A3 + (-B3) * B6. */ };
-+
-+VECT_VAR_DECL (expected7_fms_laneq_static, hfloat, 16, 4) []
-+ = { 0xFC00 /* A0 + (-B0) * B7. */,
-+ 0x7C00 /* A1 + (-B1) * B7. */,
-+ 0xFC00 /* A2 + (-B2) * B7. */,
-+ 0xFC00 /* A3 + (-B3) * B7. */ };
-+
-+/* Expected results for vfmsq_laneq. */
-+VECT_VAR_DECL (expected0_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0xDEA2 /* A0 + (-B0) * B0. */,
-+ 0x5810 /* A1 + (-B1) * B0. */,
-+ 0xDA82 /* A2 + (-B2) * B0. */,
-+ 0xD53A /* A3 + (-B3) * B0. */,
-+ 0x7C00 /* A4 + (-B4) * B0. */,
-+ 0x724B /* A5 + (-B5) * B0. */,
-+ 0x6286 /* A6 + (-B6) * B0. */,
-+ 0xFC00 /* A7 + (-B7) * B0. */ };
-+
-+VECT_VAR_DECL (expected1_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0x5C0D /* A0 + (-B0) * B1. */,
-+ 0xD0EE /* A1 + (-B1) * B1. */,
-+ 0x5274 /* A2 + (-B2) * B1. */,
-+ 0x5026 /* A3 + (-B3) * B1. */,
-+ 0x7C00 /* A4 + (-B4) * B1. */,
-+ 0xEA41 /* A5 + (-B5) * B1. */,
-+ 0xD5DA /* A6 + (-B6) * B1. */,
-+ 0x7C00 /* A7 + (-B7) * B1. */ };
-+
-+VECT_VAR_DECL (expected2_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0xD54E /* A0 + (-B0) * B2. */,
-+ 0x51BA /* A1 + (-B1) * B2. */,
-+ 0xD4F3 /* A2 + (-B2) * B2. */,
-+ 0xCE66 /* A3 + (-B3) * B2. */,
-+ 0x7C00 /* A4 + (-B4) * B2. */,
-+ 0x6CC8 /* A5 + (-B5) * B2. */,
-+ 0x5DD7 /* A6 + (-B6) * B2. */,
-+ 0xFC00 /* A7 + (-B7) * B2. */ };
-+
-+VECT_VAR_DECL (expected3_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0x4F70 /* A0 + (-B0) * B3. */,
-+ 0x4C5A /* A1 + (-B1) * B3. */,
-+ 0xD073 /* A2 + (-B2) * B3. */,
-+ 0xC600 /* A3 + (-B3) * B3. */,
-+ 0x7C00 /* A4 + (-B4) * B3. */,
-+ 0x684B /* A5 + (-B5) * B3. */,
-+ 0x5AD0 /* A6 + (-B6) * B3. */,
-+ 0xFC00 /* A7 + (-B7) * B3. */ };
-+
-+VECT_VAR_DECL (expected4_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0x5179 /* A0 + (-B0) * B4. */,
-+ 0x4AF6 /* A1 + (-B1) * B4. */,
-+ 0xCF91 /* A2 + (-B2) * B4. */,
-+ 0xC334 /* A3 + (-B3) * B4. */,
-+ 0x7C00 /* A4 + (-B4) * B4. */,
-+ 0x674C /* A5 + (-B5) * B4. */,
-+ 0x5A37 /* A6 + (-B6) * B4. */,
-+ 0xFC00 /* A7 + (-B7) * B4. */ };
-+
-+VECT_VAR_DECL (expected5_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0x725C /* A0 + (-B0) * B5. */,
-+ 0xEA41 /* A1 + (-B1) * B5. */,
-+ 0x6CCA /* A2 + (-B2) * B5. */,
-+ 0x6853 /* A3 + (-B3) * B5. */,
-+ 0x7C00 /* A4 + (-B4) * B5. */,
-+ 0xFC00 /* A5 + (-B5) * B5. */,
-+ 0xF441 /* A6 + (-B6) * B5. */,
-+ 0x7C00 /* A7 + (-B7) * B5. */ };
-+
-+VECT_VAR_DECL (expected6_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0x62C7 /* A0 + (-B0) * B6. */,
-+ 0xD9F2 /* A1 + (-B1) * B6. */,
-+ 0x5C6C /* A2 + (-B2) * B6. */,
-+ 0x584A /* A3 + (-B3) * B6. */,
-+ 0x7C00 /* A4 + (-B4) * B6. */,
-+ 0xF447 /* A5 + (-B5) * B6. */,
-+ 0xE330 /* A6 + (-B6) * B6. */,
-+ 0x7C00 /* A7 + (-B7) * B6. */ };
-+
-+VECT_VAR_DECL (expected7_fms_laneq_static, hfloat, 16, 8) []
-+ = { 0xFC00 /* A0 + (-B0) * B7. */,
-+ 0x7C00 /* A1 + (-B1) * B7. */,
-+ 0xFC00 /* A2 + (-B2) * B7. */,
-+ 0xFC00 /* A3 + (-B3) * B7. */,
-+ 0x7C00 /* A4 + (-B4) * B7. */,
-+ 0x7C00 /* A5 + (-B5) * B7. */,
-+ 0x7C00 /* A6 + (-B6) * B7. */,
-+ 0xFC00 /* A7 + (-B7) * B7. */ };
-+
-+void exec_vfmas_lane_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VFMA_LANE (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A0, A1, A2, A3};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {B0, B1, B2, B3};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMAQ_LANE (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A0, A1, A2, A3, A4, A5, A6, A7};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {B0, B1, B2, B3, B4, B5, B6, B7};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMA_LANEQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_3, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_3, float, 16, 8) [] = {B0, B1, B2, B3, B4, B5, B6, B7};
-+ VLOAD (vsrc_3, buf_src_3, q, float, f, 16, 8);
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 4);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected4_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 5);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected5_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 6);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected6_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 7);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected7_laneq_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMAQ_LANEQ (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 4);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected4_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 5);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected5_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 6);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected6_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 7);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected7_laneq_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMS_LANE (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_fms_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_fms_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_fms_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_fms_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMSQ_LANE (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_fms_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_fms_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_fms_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_fms_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMS_LANEQ (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 4);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected4_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 5);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected5_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 6);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected6_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4),
-+ VECT_VAR (vsrc_3, float, 16, 8), 7);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected7_fms_laneq_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMSQ_LANEQ (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 4);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected4_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 5);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected5_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 6);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected6_fms_laneq_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8),
-+ VECT_VAR (vsrc_3, float, 16, 8), 7);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected7_fms_laneq_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vfmas_lane_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmas_n_f16_1.c
-@@ -0,0 +1,469 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (123.4)
-+#define A1 FP16_C (-5.8)
-+#define A2 FP16_C (-0.0)
-+#define A3 FP16_C (10)
-+#define A4 FP16_C (123412.43)
-+#define A5 FP16_C (-5.8)
-+#define A6 FP16_C (90.8)
-+#define A7 FP16_C (24)
-+
-+#define B0 FP16_C (23.4)
-+#define B1 FP16_C (-5.8)
-+#define B2 FP16_C (8.9)
-+#define B3 FP16_C (4.0)
-+#define B4 FP16_C (3.4)
-+#define B5 FP16_C (-550.8)
-+#define B6 FP16_C (-31.8)
-+#define B7 FP16_C (20000.0)
-+
-+/* Expected results for vfma_n. */
-+VECT_VAR_DECL (expected_fma0_static, hfloat, 16, 4) []
-+ = { 0x613E /* A0 + B0 * B0. */,
-+ 0xD86D /* A1 + B1 * B0. */,
-+ 0x5A82 /* A2 + B2 * B0. */,
-+ 0x567A /* A3 + B3 * B0. */ };
-+
-+VECT_VAR_DECL (expected_fma1_static, hfloat, 16, 4) []
-+ = { 0xCA33 /* A0 + B0 * B1. */,
-+ 0x4EF6 /* A1 + B1 * B1. */,
-+ 0xD274 /* A2 + B2 * B1. */,
-+ 0xCA9A /* A3 + B3 * B1. */ };
-+
-+VECT_VAR_DECL (expected_fma2_static, hfloat, 16, 4) []
-+ = { 0x5D2F /* A0 + B0 * B2. */,
-+ 0xD32D /* A1 + B1 * B2. */,
-+ 0x54F3 /* A2 + B2 * B2. */,
-+ 0x51B3 /* A3 + B3 * B2. */ };
-+
-+VECT_VAR_DECL (expected_fma3_static, hfloat, 16, 4) []
-+ = { 0x5AC8 /* A0 + B0 * B3. */,
-+ 0xCF40 /* A1 + B1 * B3. */,
-+ 0x5073 /* A2 + B2 * B3. */,
-+ 0x4E80 /* A3 + B3 * B3. */ };
-+
-+VECT_VAR_DECL (expected_fma0_static, hfloat, 16, 8) []
-+ = { 0x613E /* A0 + B0 * B0. */,
-+ 0xD86D /* A1 + B1 * B0. */,
-+ 0x5A82 /* A2 + B2 * B0. */,
-+ 0x567A /* A3 + B3 * B0. */,
-+ 0x7C00 /* A4 + B4 * B0. */,
-+ 0xF24D /* A5 + B5 * B0. */,
-+ 0xE11B /* A6 + B6 * B0. */,
-+ 0x7C00 /* A7 + B7 * B0. */ };
-+
-+VECT_VAR_DECL (expected_fma1_static, hfloat, 16, 8) []
-+ = { 0xCA33 /* A0 + B0 * B1. */,
-+ 0x4EF6 /* A1 + B1 * B1. */,
-+ 0xD274 /* A2 + B2 * B1. */,
-+ 0xCA9A /* A3 + B3 * B1. */,
-+ 0x7C00 /* A4 + B4 * B1. */,
-+ 0x6A3B /* A5 + B5 * B1. */,
-+ 0x5C4D /* A6 + B6 * B1. */,
-+ 0xFC00 /* A7 + B7 * B1. */ };
-+
-+VECT_VAR_DECL (expected_fma2_static, hfloat, 16, 8) []
-+ = { 0x5D2F /* A0 + B0 * B2. */,
-+ 0xD32D /* A1 + B1 * B2. */,
-+ 0x54F3 /* A2 + B2 * B2. */,
-+ 0x51B3 /* A3 + B3 * B2. */,
-+ 0x7C00 /* A4 + B4 * B2. */,
-+ 0xECCB /* A5 + B5 * B2. */,
-+ 0xDA01 /* A6 + B6 * B2. */,
-+ 0x7C00 /* A7 + B7 * B2. */ };
-+
-+VECT_VAR_DECL (expected_fma3_static, hfloat, 16, 8) []
-+ = { 0x5AC8 /* A0 + B0 * B3. */,
-+ 0xCF40 /* A1 + B1 * B3. */,
-+ 0x5073 /* A2 + B2 * B3. */,
-+ 0x4E80 /* A3 + B3 * B3. */,
-+ 0x7C00 /* A4 + B4 * B3. */,
-+ 0xE851 /* A5 + B5 * B3. */,
-+ 0xD08C /* A6 + B6 * B3. */,
-+ 0x7C00 /* A7 + B7 * B3. */ };
-+
-+VECT_VAR_DECL (expected_fma4_static, hfloat, 16, 8) []
-+ = { 0x5A58 /* A0 + B0 * B4. */,
-+ 0xCE62 /* A1 + B1 * B4. */,
-+ 0x4F91 /* A2 + B2 * B4. */,
-+ 0x4DE6 /* A3 + B3 * B4. */,
-+ 0x7C00 /* A4 + B4 * B4. */,
-+ 0xE757 /* A5 + B5 * B4. */,
-+ 0xCC54 /* A6 + B6 * B4. */,
-+ 0x7C00 /* A7 + B7 * B4. */ };
-+
-+VECT_VAR_DECL (expected_fma5_static, hfloat, 16, 8) []
-+ = { 0xF23D /* A0 + B0 * B5. */,
-+ 0x6A3B /* A1 + B1 * B5. */,
-+ 0xECCA /* A2 + B2 * B5. */,
-+ 0xE849 /* A3 + B3 * B5. */,
-+ 0x7C00 /* A4 + B4 * B5. */,
-+ 0x7C00 /* A5 + B5 * B5. */,
-+ 0x744D /* A6 + B6 * B5. */,
-+ 0xFC00 /* A7 + B7 * B5. */ };
-+
-+VECT_VAR_DECL (expected_fma6_static, hfloat, 16, 8) []
-+ = { 0xE0DA /* A0 + B0 * B6. */,
-+ 0x5995 /* A1 + B1 * B6. */,
-+ 0xDC6C /* A2 + B2 * B6. */,
-+ 0xD753 /* A3 + B3 * B6. */,
-+ 0x7C00 /* A4 + B4 * B6. */,
-+ 0x7447 /* A5 + B5 * B6. */,
-+ 0x644E /* A6 + B6 * B6. */,
-+ 0xFC00 /* A7 + B7 * B6. */ };
-+
-+VECT_VAR_DECL (expected_fma7_static, hfloat, 16, 8) []
-+ = { 0x7C00 /* A0 + B0 * B7. */,
-+ 0xFC00 /* A1 + B1 * B7. */,
-+ 0x7C00 /* A2 + B2 * B7. */,
-+ 0x7C00 /* A3 + B3 * B7. */,
-+ 0x7C00 /* A4 + B4 * B7. */,
-+ 0xFC00 /* A5 + B5 * B7. */,
-+ 0xFC00 /* A6 + B6 * B7. */,
-+ 0x7C00 /* A7 + B7 * B7. */ };
-+
-+/* Expected results for vfms_n. */
-+VECT_VAR_DECL (expected_fms0_static, hfloat, 16, 4) []
-+ = { 0xDEA2 /* A0 + (-B0) * B0. */,
-+ 0x5810 /* A1 + (-B1) * B0. */,
-+ 0xDA82 /* A2 + (-B2) * B0. */,
-+ 0xD53A /* A3 + (-B3) * B0. */ };
-+
-+VECT_VAR_DECL (expected_fms1_static, hfloat, 16, 4) []
-+ = { 0x5C0D /* A0 + (-B0) * B1. */,
-+ 0xD0EE /* A1 + (-B1) * B1. */,
-+ 0x5274 /* A2 + (-B2) * B1. */,
-+ 0x5026 /* A3 + (-B3) * B1. */ };
-+
-+VECT_VAR_DECL (expected_fms2_static, hfloat, 16, 4) []
-+ = { 0xD54E /* A0 + (-B0) * B2. */,
-+ 0x51BA /* A1 + (-B1) * B2. */,
-+ 0xD4F3 /* A2 + (-B2) * B2. */,
-+ 0xCE66 /* A3 + (-B3) * B2. */ };
-+
-+VECT_VAR_DECL (expected_fms3_static, hfloat, 16, 4) []
-+ = { 0x4F70 /* A0 + (-B0) * B3. */,
-+ 0x4C5A /* A1 + (-B1) * B3. */,
-+ 0xD073 /* A2 + (-B2) * B3. */,
-+ 0xC600 /* A3 + (-B3) * B3. */ };
-+
-+VECT_VAR_DECL (expected_fms0_static, hfloat, 16, 8) []
-+ = { 0xDEA2 /* A0 + (-B0) * B0. */,
-+ 0x5810 /* A1 + (-B1) * B0. */,
-+ 0xDA82 /* A2 + (-B2) * B0. */,
-+ 0xD53A /* A3 + (-B3) * B0. */,
-+ 0x7C00 /* A4 + (-B4) * B0. */,
-+ 0x724B /* A5 + (-B5) * B0. */,
-+ 0x6286 /* A6 + (-B6) * B0. */,
-+ 0xFC00 /* A7 + (-B7) * B0. */ };
-+
-+VECT_VAR_DECL (expected_fms1_static, hfloat, 16, 8) []
-+ = { 0x5C0D /* A0 + (-B0) * B1. */,
-+ 0xD0EE /* A1 + (-B1) * B1. */,
-+ 0x5274 /* A2 + (-B2) * B1. */,
-+ 0x5026 /* A3 + (-B3) * B1. */,
-+ 0x7C00 /* A4 + (-B4) * B1. */,
-+ 0xEA41 /* A5 + (-B5) * B1. */,
-+ 0xD5DA /* A6 + (-B6) * B1. */,
-+ 0x7C00 /* A7 + (-B7) * B1. */ };
-+
-+VECT_VAR_DECL (expected_fms2_static, hfloat, 16, 8) []
-+ = { 0xD54E /* A0 + (-B0) * B2. */,
-+ 0x51BA /* A1 + (-B1) * B2. */,
-+ 0xD4F3 /* A2 + (-B2) * B2. */,
-+ 0xCE66 /* A3 + (-B3) * B2. */,
-+ 0x7C00 /* A4 + (-B4) * B2. */,
-+ 0x6CC8 /* A5 + (-B5) * B2. */,
-+ 0x5DD7 /* A6 + (-B6) * B2. */,
-+ 0xFC00 /* A7 + (-B7) * B2. */ };
-+
-+VECT_VAR_DECL (expected_fms3_static, hfloat, 16, 8) []
-+ = { 0x4F70 /* A0 + (-B0) * B3. */,
-+ 0x4C5A /* A1 + (-B1) * B3. */,
-+ 0xD073 /* A2 + (-B2) * B3. */,
-+ 0xC600 /* A3 + (-B3) * B3. */,
-+ 0x7C00 /* A4 + (-B4) * B3. */,
-+ 0x684B /* A5 + (-B5) * B3. */,
-+ 0x5AD0 /* A6 + (-B6) * B3. */,
-+ 0xFC00 /* A7 + (-B7) * B3. */ };
-+
-+VECT_VAR_DECL (expected_fms4_static, hfloat, 16, 8) []
-+ = { 0x5179 /* A0 + (-B0) * B4. */,
-+ 0x4AF6 /* A1 + (-B1) * B4. */,
-+ 0xCF91 /* A2 + (-B2) * B4. */,
-+ 0xC334 /* A3 + (-B3) * B4. */,
-+ 0x7C00 /* A4 + (-B4) * B4. */,
-+ 0x674C /* A5 + (-B5) * B4. */,
-+ 0x5A37 /* A6 + (-B6) * B4. */,
-+ 0xFC00 /* A7 + (-B7) * B4. */ };
-+
-+VECT_VAR_DECL (expected_fms5_static, hfloat, 16, 8) []
-+ = { 0x725C /* A0 + (-B0) * B5. */,
-+ 0xEA41 /* A1 + (-B1) * B5. */,
-+ 0x6CCA /* A2 + (-B2) * B5. */,
-+ 0x6853 /* A3 + (-B3) * B5. */,
-+ 0x7C00 /* A4 + (-B4) * B5. */,
-+ 0xFC00 /* A5 + (-B5) * B5. */,
-+ 0xF441 /* A6 + (-B6) * B5. */,
-+ 0x7C00 /* A7 + (-B7) * B5. */ };
-+
-+VECT_VAR_DECL (expected_fms6_static, hfloat, 16, 8) []
-+ = { 0x62C7 /* A0 + (-B0) * B6. */,
-+ 0xD9F2 /* A1 + (-B1) * B6. */,
-+ 0x5C6C /* A2 + (-B2) * B6. */,
-+ 0x584A /* A3 + (-B3) * B6. */,
-+ 0x7C00 /* A4 + (-B4) * B6. */,
-+ 0xF447 /* A5 + (-B5) * B6. */,
-+ 0xE330 /* A6 + (-B6) * B6. */,
-+ 0x7C00 /* A7 + (-B7) * B6. */ };
-+
-+VECT_VAR_DECL (expected_fms7_static, hfloat, 16, 8) []
-+ = { 0xFC00 /* A0 + (-B0) * B7. */,
-+ 0x7C00 /* A1 + (-B1) * B7. */,
-+ 0xFC00 /* A2 + (-B2) * B7. */,
-+ 0xFC00 /* A3 + (-B3) * B7. */,
-+ 0x7C00 /* A4 + (-B4) * B7. */,
-+ 0x7C00 /* A5 + (-B5) * B7. */,
-+ 0x7C00 /* A6 + (-B6) * B7. */,
-+ 0xFC00 /* A7 + (-B7) * B7. */ };
-+
-+void exec_vfmas_n_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VFMA_N (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A0, A1, A2, A3};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {B0, B1, B2, B3};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B0);
-+
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfma_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fma3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMAQ_N (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A0, A1, A2, A3, A4, A5, A6, A7};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {B0, B1, B2, B3, B4, B5, B6, B7};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma3_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B4);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma4_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B5);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma5_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B6);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma6_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmaq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B7);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fma7_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMA_N (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B0);
-+
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vfms_n_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), B3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_fms3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMAQ_N (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms3_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B4);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms4_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B5);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms5_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B6);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms6_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vfmsq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), B7);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_fms7_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vfmas_n_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmash_lane_f16_1.c
-@@ -0,0 +1,144 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (123.4)
-+#define B0 FP16_C (-5.8)
-+#define C0 FP16_C (-3.8)
-+#define D0 FP16_C (10)
-+
-+#define A1 FP16_C (12.4)
-+#define B1 FP16_C (-5.8)
-+#define C1 FP16_C (90.8)
-+#define D1 FP16_C (24)
-+
-+#define A2 FP16_C (23.4)
-+#define B2 FP16_C (-5.8)
-+#define C2 FP16_C (8.9)
-+#define D2 FP16_C (4)
-+
-+#define E0 FP16_C (3.4)
-+#define F0 FP16_C (-55.8)
-+#define G0 FP16_C (-31.8)
-+#define H0 FP16_C (2)
-+
-+#define E1 FP16_C (123.4)
-+#define F1 FP16_C (-5.8)
-+#define G1 FP16_C (-3.8)
-+#define H1 FP16_C (102)
-+
-+#define E2 FP16_C (4.9)
-+#define F2 FP16_C (-15.8)
-+#define G2 FP16_C (39.8)
-+#define H2 FP16_C (49)
-+
-+extern void abort ();
-+
-+float16_t src1[8] = { A0, B0, C0, D0, E0, F0, G0, H0 };
-+float16_t src2[8] = { A1, B1, C1, D1, E1, F1, G1, H1 };
-+VECT_VAR_DECL (src3, float, 16, 4) [] = { A2, B2, C2, D2 };
-+VECT_VAR_DECL (src3, float, 16, 8) [] = { A2, B2, C2, D2, E2, F2, G2, H2 };
-+
-+/* Expected results for vfmah_lane_f16. */
-+uint16_t expected[4] = { 0x5E76 /* A0 + A1 * A2. */,
-+ 0x4EF6 /* B0 + B1 * B2. */,
-+ 0x6249 /* C0 + C1 * C2. */,
-+ 0x56A0 /* D0 + D1 * D2. */ };
-+
-+/* Expected results for vfmah_laneq_f16. */
-+uint16_t expected_laneq[8] = { 0x5E76 /* A0 + A1 * A2. */,
-+ 0x4EF6 /* B0 + B1 * B2. */,
-+ 0x6249 /* C0 + C1 * C2. */,
-+ 0x56A0 /* D0 + D1 * D2. */,
-+ 0x60BF /* E0 + E1 * E2. */,
-+ 0x507A /* F0 + F1 * F2. */,
-+ 0xD9B9 /* G0 + G1 * G2. */,
-+ 0x6CE2 /* H0 + H1 * H2. */ };
-+
-+/* Expected results for vfmsh_lane_f16. */
-+uint16_t expected_fms[4] = { 0xD937 /* A0 + -A1 * A2. */,
-+ 0xD0EE /* B0 + -B1 * B2. */,
-+ 0xE258 /* C0 + -C1 * C2. */,
-+ 0xD560 /* D0 + -D1 * D2. */ };
-+
-+/* Expected results for vfmsh_laneq_f16. */
-+uint16_t expected_fms_laneq[8] = { 0xD937 /* A0 + -A1 * A2. */,
-+ 0xD0EE /* B0 + -B1 * B2. */,
-+ 0xE258 /* C0 + -C1 * C2. */,
-+ 0xD560 /* D0 + -D1 * D2. */,
-+ 0xE0B2 /* E0 + -E1 * E2. */,
-+ 0xD89C /* F0 + -F1 * F2. */,
-+ 0x5778 /* G0 + -G1 * G2. */,
-+ 0xECE1 /* H0 + -H1 * H2. */ };
-+
-+void exec_vfmash_lane_f16 (void)
-+{
-+#define CHECK_LANE(N) \
-+ ret = vfmah_lane_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 4), N);\
-+ if (*(uint16_t *) &ret != expected[N])\
-+ abort ();
-+
-+ DECL_VARIABLE(vsrc3, float, 16, 4);
-+ VLOAD (vsrc3, src3, , float, f, 16, 4);
-+ float16_t ret;
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+
-+#undef CHECK_LANE
-+#define CHECK_LANE(N) \
-+ ret = vfmah_laneq_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 8), N);\
-+ if (*(uint16_t *) &ret != expected_laneq[N]) \
-+ abort ();
-+
-+ DECL_VARIABLE(vsrc3, float, 16, 8);
-+ VLOAD (vsrc3, src3, q, float, f, 16, 8);
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+ CHECK_LANE(4)
-+ CHECK_LANE(5)
-+ CHECK_LANE(6)
-+ CHECK_LANE(7)
-+
-+#undef CHECK_LANE
-+#define CHECK_LANE(N) \
-+ ret = vfmsh_lane_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 4), N);\
-+ if (*(uint16_t *) &ret != expected_fms[N])\
-+ abort ();
-+
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+  CHECK_LANE(2)
-+  CHECK_LANE(3)
-+
-+#undef CHECK_LANE
-+#define CHECK_LANE(N) \
-+ ret = vfmsh_laneq_f16 (src1[N], src2[N], VECT_VAR (vsrc3, float, 16, 8), N);\
-+ if (*(uint16_t *) &ret != expected_fms_laneq[N]) \
-+ abort ();
-+
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+ CHECK_LANE(4)
-+ CHECK_LANE(5)
-+ CHECK_LANE(6)
-+ CHECK_LANE(7)
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vfmash_lane_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c
-@@ -4,10 +4,17 @@
-
- #ifdef __ARM_FEATURE_FMA
- /* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xe206, 0xe204, 0xe202, 0xe200 };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xe455, 0xe454, 0xe453, 0xe452,
-+ 0xe451, 0xe450, 0xe44f, 0xe44e };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d };
--VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, 0xc48a5eb8, 0xc48a3eb8 };
-+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8,
-+ 0xc48a5eb8, 0xc48a3eb8 };
- #ifdef __aarch64__
--VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, 0xc089fee1532b8520 };
-+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520,
-+ 0xc089fee1532b8520 };
- #endif
-
- #define TEST_MSG "VFMS/VFMSQ"
-@@ -44,6 +51,18 @@ void exec_vfms (void)
- DECL_VARIABLE(VAR, float, 32, 4);
- #endif
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector1, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector3, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+
-+ DECL_VARIABLE(vector1, float, 16, 8);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+ DECL_VARIABLE(vector3, float, 16, 8);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+
- DECL_VFMS_VAR(vector1);
- DECL_VFMS_VAR(vector2);
- DECL_VFMS_VAR(vector3);
-@@ -52,6 +71,10 @@ void exec_vfms (void)
- clean_results ();
-
- /* Initialize input "vector1" from "buffer". */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector1, buffer, , float, f, 16, 4);
-+ VLOAD(vector1, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector1, buffer, , float, f, 32, 2);
- VLOAD(vector1, buffer, q, float, f, 32, 4);
- #ifdef __aarch64__
-@@ -59,13 +82,21 @@ void exec_vfms (void)
- #endif
-
- /* Choose init value arbitrarily. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 9.3f);
-+ VDUP(vector2, q, float, f, 16, 8, 29.7f);
-+#endif
- VDUP(vector2, , float, f, 32, 2, 9.3f);
- VDUP(vector2, q, float, f, 32, 4, 29.7f);
- #ifdef __aarch64__
- VDUP(vector2, q, float, f, 64, 2, 15.8f);
- #endif
--
-+
- /* Choose init value arbitrarily. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector3, , float, f, 16, 4, 81.2f);
-+ VDUP(vector3, q, float, f, 16, 8, 36.8f);
-+#endif
- VDUP(vector3, , float, f, 32, 2, 81.2f);
- VDUP(vector3, q, float, f, 32, 4, 36.8f);
- #ifdef __aarch64__
-@@ -73,12 +104,20 @@ void exec_vfms (void)
- #endif
-
- /* Execute the tests. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VFMS(, float, f, 16, 4);
-+ TEST_VFMS(q, float, f, 16, 8);
-+#endif
- TEST_VFMS(, float, f, 32, 2);
- TEST_VFMS(q, float, f, 32, 4);
- #ifdef __aarch64__
- TEST_VFMS(q, float, f, 64, 2);
- #endif
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
- CHECK_VFMS_RESULTS (TEST_MSG, "");
- }
- #endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c
-@@ -0,0 +1,490 @@
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
-+
-+#define A0 123.4f
-+#define A1 -3.8f
-+#define A2 -29.4f
-+#define A3 (__builtin_inff ())
-+#define A4 0.0f
-+#define A5 24.0f
-+#define A6 124.0f
-+#define A7 1024.0f
-+
-+#define B0 -5.8f
-+#define B1 -0.0f
-+#define B2 -10.8f
-+#define B3 10.0f
-+#define B4 23.4f
-+#define B5 -1234.8f
-+#define B6 8.9f
-+#define B7 4.0f
-+
-+#define E0 9.8f
-+#define E1 -1024.0f
-+#define E2 (-__builtin_inff ())
-+#define E3 479.0f
-+float32_t elem0 = E0;
-+float32_t elem1 = E1;
-+float32_t elem2 = E2;
-+float32_t elem3 = E3;
-+
-+#define DA0 1231234.4
-+#define DA1 -3.8
-+#define DA2 -2980.4
-+#define DA3 -5.8
-+#define DA4 0.01123
-+#define DA5 24.0
-+#define DA6 124.12345
-+#define DA7 1024.0
-+
-+#define DB0 -5.8
-+#define DB1 (__builtin_inf ())
-+#define DB2 -105.8
-+#define DB3 10.0
-+#define DB4 (-__builtin_inf ())
-+#define DB5 -1234.8
-+#define DB6 848.9
-+#define DB7 44444.0
-+
-+#define DE0 9.8
-+#define DE1 -1024.0
-+#define DE2 105.8
-+#define DE3 479.0
-+float64_t delem0 = DE0;
-+float64_t delem1 = DE1;
-+float64_t delem2 = DE2;
-+float64_t delem3 = DE3;
-+
-+/* Expected results for vfms_n. */
-+
-+VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * E0};
-+VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * E1};
-+VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * E2};
-+VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * E3};
-+VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0};
-+VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1};
-+VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2};
-+VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3};
-+
-+hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2);
-+hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) =
-+ (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2);
-+
-+
-+VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0,
-+ A2 + -B2 * E0, A3 + -B3 * E0};
-+VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1,
-+ A6 + -B6 * E1, A7 + -B7 * E1};
-+VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2,
-+ A4 + -B4 * E2, A6 + -B6 * E2};
-+VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3,
-+ A5 + -B5 * E3, A7 + -B7 * E3};
-+VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0,
-+ A2 + B2 * E0, A3 + B3 * E0};
-+VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1,
-+ A6 + B6 * E1, A7 + B7 * E1};
-+VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2,
-+ A4 + B4 * E2, A6 + B6 * E2};
-+VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3,
-+ A5 + B5 * E3, A7 + B7 * E3};
-+
-+hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4);
-+hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) =
-+ (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4);
-+
-+VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0,
-+ DA1 + -DB1 * DE0};
-+VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1,
-+ DA3 + -DB3 * DE1};
-+VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2,
-+ DA5 + -DB5 * DE2};
-+VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3,
-+ DA7 + -DB7 * DE3};
-+VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0,
-+ DA1 + DB1 * DE0};
-+VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1,
-+ DA3 + DB3 * DE1};
-+VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2,
-+ DA5 + DB5 * DE2};
-+VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3,
-+ DA7 + DB7 * DE3};
-+hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 2);
-+hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) =
-+ (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2);
-+
-+VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0};
-+VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1};
-+VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2};
-+VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3};
-+VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0};
-+VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1};
-+VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2};
-+VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3};
-+
-+hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1);
-+hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) =
-+ (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1);
-+
-+void exec_vfma_vfms_n (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VFMS_VFMA_N (FP32)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 32, 2);
-+ DECL_VARIABLE(vsrc_2, float, 32, 2);
-+ VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1};
-+ VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2);
-+ DECL_VARIABLE (vector_res, float, 32, 2) =
-+ vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem0);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms0_static, "");
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem0);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma0_static, "");
-+
-+ VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3};
-+ VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3};
-+ VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2);
-+ VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2);
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem1);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms1_static, "");
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem1);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma1_static, "");
-+
-+ VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5};
-+ VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5};
-+ VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2);
-+ VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2);
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem2);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms2_static, "");
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem2);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma2_static, "");
-+
-+ VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7};
-+ VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7};
-+ VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2);
-+ VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2);
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem3);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfms3_static, "");
-+ VECT_VAR (vector_res, float, 32, 2) =
-+ vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2),
-+ VECT_VAR (vsrc_2, float, 32, 2), elem3);
-+ vst1_f32 (VECT_VAR (result, float, 32, 2),
-+ VECT_VAR (vector_res, float, 32, 2));
-+  CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expectedfma3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 32, 4);
-+ DECL_VARIABLE(vsrc_2, float, 32, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3};
-+ VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4);
-+ DECL_VARIABLE (vector_res, float, 32, 4) =
-+ vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem0);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms0_static, "");
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem0);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma0_static, "");
-+
-+ VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7};
-+ VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7};
-+ VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4);
-+ VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4);
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem1);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms1_static, "");
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem1);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma1_static, "");
-+
-+ VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6};
-+ VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6};
-+ VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4);
-+ VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4);
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem2);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms2_static, "");
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem2);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma2_static, "");
-+
-+ VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7};
-+ VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7};
-+ VLOAD (vsrc_1, buf_src_7, q, float, f, 32, 4);
-+ VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4);
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem3);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfms3_static, "");
-+ VECT_VAR (vector_res, float, 32, 4) =
-+ vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4),
-+ VECT_VAR (vsrc_2, float, 32, 4), elem3);
-+ vst1q_f32 (VECT_VAR (result, float, 32, 4),
-+ VECT_VAR (vector_res, float, 32, 4));
-+  CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expectedfma3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 64, 2);
-+ DECL_VARIABLE(vsrc_2, float, 64, 2);
-+ VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1};
-+ VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2);
-+ DECL_VARIABLE (vector_res, float, 64, 2) =
-+ vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem0);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms0_static, "");
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem0);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma0_static, "");
-+
-+ VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3};
-+ VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3};
-+ VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2);
-+ VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2);
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem1);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms1_static, "");
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem1);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma1_static, "");
-+
-+ VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5};
-+ VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5};
-+ VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2);
-+ VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2);
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem2);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms2_static, "");
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem2);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma2_static, "");
-+
-+ VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7};
-+ VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7};
-+ VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2);
-+ VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2);
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem3);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfms3_static, "");
-+ VECT_VAR (vector_res, float, 64, 2) =
-+ vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2),
-+ VECT_VAR (vsrc_2, float, 64, 2), delem3);
-+ vst1q_f64 (VECT_VAR (result, float, 64, 2),
-+ VECT_VAR (vector_res, float, 64, 2));
-+ CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expectedfma3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VFMS_VFMA_N (FP64)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 64, 1);
-+ DECL_VARIABLE(vsrc_2, float, 64, 1);
-+ VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0};
-+ VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1);
-+ DECL_VARIABLE (vector_res, float, 64, 1) =
-+ vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem0);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms0_static, "");
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem0);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma0_static, "");
-+
-+ VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2};
-+ VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2};
-+ VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1);
-+ VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1);
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem1);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms1_static, "");
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem1);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma1_static, "");
-+
-+ VECT_VAR_DECL (buf_src_5, float, 64, 1) [] = {DA4};
-+ VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4};
-+ VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1);
-+ VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1);
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem2);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms2_static, "");
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem2);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma2_static, "");
-+
-+ VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6};
-+ VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6};
-+ VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1);
-+ VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1);
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem3);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfms3_static, "");
-+ VECT_VAR (vector_res, float, 64, 1) =
-+ vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1),
-+ VECT_VAR (vsrc_2, float, 64, 1), delem3);
-+ vst1_f64 (VECT_VAR (result, float, 64, 1),
-+ VECT_VAR (vector_res, float, 64, 1));
-+ CHECK_FP (TEST_MSG, float, 64, 1, PRIx64, expectedfma3_static, "");
-+}
-+#endif
-+
-+int
-+main (void)
-+{
-+#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA)
-+ exec_vfma_vfms_n ();
-+#endif
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfmsh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x42af /* 3.341797 */,
-+ 0x5043 /* 34.093750 */,
-+ 0xccd2 /* -19.281250 */,
-+ 0x3712 /* 0.441895 */,
-+ 0x3acc /* 0.849609 */,
-+ 0x4848 /* 8.562500 */,
-+ 0xcc43 /* -17.046875 */,
-+ 0xd65c /* -101.750000 */,
-+ 0x4185 /* 2.759766 */,
-+ 0xcd39 /* -20.890625 */,
-+ 0xd45b /* -69.687500 */,
-+ 0x5241 /* 50.031250 */,
-+ 0xc675 /* -6.457031 */,
-+ 0x4d07 /* 20.109375 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VFMSH_F16"
-+#define INSN_NAME vfmsh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for ternary scalar operations. */
-+#include "ternary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
-@@ -63,8 +63,8 @@ void exec_vget_high (void)
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
- }
-
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
-@@ -13,6 +13,7 @@ uint32_t expected_u32 = 0xfffffff1;
- uint64_t expected_u64 = 0xfffffffffffffff0;
- poly8_t expected_p8 = 0xf6;
- poly16_t expected_p16 = 0xfff2;
-+hfloat16_t expected_f16 = 0xcb80;
- hfloat32_t expected_f32 = 0xc1700000;
-
- int8_t expectedq_s8 = 0xff;
-@@ -25,6 +26,7 @@ uint32_t expectedq_u32 = 0xfffffff2;
- uint64_t expectedq_u64 = 0xfffffffffffffff1;
- poly8_t expectedq_p8 = 0xfe;
- poly16_t expectedq_p16 = 0xfff6;
-+hfloat16_t expectedq_f16 = 0xca80;
- hfloat32_t expectedq_f32 = 0xc1500000;
-
- int error_found = 0;
-@@ -52,6 +54,12 @@ void exec_vget_lane (void)
- uint32_t var_int32;
- float32_t var_float32;
- } var_int32_float32;
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ union {
-+ uint16_t var_int16;
-+ float16_t var_float16;
-+ } var_int16_float16;
-+#endif
-
- #define TEST_VGET_LANE_FP(Q, T1, T2, W, N, L) \
- VAR(var, T1, W) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \
-@@ -81,10 +89,17 @@ void exec_vget_lane (void)
- VAR_DECL(var, uint, 64);
- VAR_DECL(var, poly, 8);
- VAR_DECL(var, poly, 16);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ VAR_DECL(var, float, 16);
-+#endif
- VAR_DECL(var, float, 32);
-
- /* Initialize input values. */
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
-@@ -99,6 +114,9 @@ void exec_vget_lane (void)
- TEST_VGET_LANE(, uint, u, 64, 1, 0);
- TEST_VGET_LANE(, poly, p, 8, 8, 6);
- TEST_VGET_LANE(, poly, p, 16, 4, 2);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VGET_LANE_FP(, float, f, 16, 4, 1);
-+#endif
- TEST_VGET_LANE_FP(, float, f, 32, 2, 1);
-
- TEST_VGET_LANE(q, int, s, 8, 16, 15);
-@@ -111,6 +129,9 @@ void exec_vget_lane (void)
- TEST_VGET_LANE(q, uint, u, 64, 2, 1);
- TEST_VGET_LANE(q, poly, p, 8, 16, 14);
- TEST_VGET_LANE(q, poly, p, 16, 8, 6);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VGET_LANE_FP(q, float, f, 16, 8, 3);
-+#endif
- TEST_VGET_LANE_FP(q, float, f, 32, 4, 3);
- }
-
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
-@@ -63,8 +63,8 @@ void exec_vget_low (void)
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
- CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
- #endif
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- float16x4x2_t
- f_vld2_lane_f16 (float16_t * p, float16x4x2_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- float16x8x2_t
- f_vld2q_lane_f16 (float16_t * p, float16x8x2_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- float16x4x3_t
- f_vld3_lane_f16 (float16_t * p, float16x4x3_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- float16x8x3_t
- f_vld3q_lane_f16 (float16_t * p, float16x8x3_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- float16x4x4_t
- f_vld4_lane_f16 (float16_t * p, float16x4x4_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- float16x8x4_t
- f_vld4q_lane_f16 (float16_t * p, float16x8x4_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
-@@ -528,8 +528,8 @@ void exec_vldX (void)
- CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
- CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
- \
- CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
-@@ -538,8 +538,8 @@ void exec_vldX (void)
- CHECK(test_name, uint, 8, 16, PRIx8, EXPECTED, comment); \
- CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
-@@ -270,8 +270,8 @@ void exec_vldX_dup (void)
- CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
- CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment)
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
-@@ -451,14 +451,14 @@ void exec_vldX_lane (void)
- CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \
- CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
- CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment); \
- CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
-
- #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmax.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmax.c
-@@ -7,6 +7,10 @@
-
- #define HAS_FLOAT_VARIANT
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+#define HAS_FLOAT16_VARIANT
-+#endif
-+
- /* Expected results. */
- VECT_VAR_DECL(expected,int,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
- 0xf4, 0xf5, 0xf6, 0xf7 };
-@@ -16,6 +20,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf3, 0xf3, 0xf3, 0xf3,
- 0xf4, 0xf5, 0xf6, 0xf7 };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff1, 0xfff1, 0xfff2, 0xfff3 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcbc0, 0xcb80, 0xcb00, 0xca80 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1780000, 0xc1700000 };
- VECT_VAR_DECL(expected,int,8,16) [] = { 0xf4, 0xf4, 0xf4, 0xf4,
- 0xf4, 0xf5, 0xf6, 0xf7,
-@@ -33,10 +40,36 @@ VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff3,
- 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
- VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff1, 0xfffffff1,
- 0xfffffff2, 0xfffffff3 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcb40, 0xcb40, 0xcb00, 0xca80,
-+ 0xca00, 0xc980, 0xc900, 0xc880 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1680000, 0xc1680000,
- 0xc1600000, 0xc1500000 };
-
- /* Expected results with special FP values. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00 };
-+VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+#endif
- VECT_VAR_DECL(expected_nan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
- 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_mnan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxh_f16_1.c
-@@ -0,0 +1,34 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+#define A 123.4
-+#define B -567.8
-+#define C -34.8
-+#define D 1024
-+#define E 663.1
-+#define F 169.1
-+#define G -4.8
-+#define H 77
-+
-+float16_t input_1[] = { A, B, C, D };
-+float16_t input_2[] = { E, F, G, H };
-+float16_t expected[] = { E, F, G, D };
-+
-+#define TEST_MSG "VMAXH_F16"
-+#define INSN_NAME vmaxh_f16
-+
-+#define INPUT_1 input_1
-+#define INPUT_2 input_2
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxnm_1.c
-@@ -0,0 +1,47 @@
-+/* This file tests an intrinsic which currently has only an f16 variant, and
-+ which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define INSN_NAME vmaxnm
-+#define TEST_MSG "VMAXNM/VMAXNMQ"
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+#define HAS_FLOAT16_VARIANT
-+#endif
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcbc0, 0xcb80, 0xcb00, 0xca80 };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcb40, 0xcb40, 0xcb00, 0xca80,
-+ 0xca00, 0xc980, 0xc900, 0xc880 };
-+#endif
-+
-+/* Expected results with special FP values. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00 };
-+VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+#endif
-+
-+#include "binary_op_float.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxnmh_f16_1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+#define INFF __builtin_inf ()
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x5640 /* 100.000000 */,
-+ 0x4f80 /* 30.000000 */,
-+ 0x3666 /* 0.399902 */,
-+ 0x3800 /* 0.500000 */,
-+ 0x3d52 /* 1.330078 */,
-+ 0xc64d /* -6.300781 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x355d /* 0.335205 */,
-+ 0x409a /* 2.300781 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a91 /* 13.132812 */,
-+ 0x34f6 /* 0.310059 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0x7c00 /* inf */
-+};
-+
-+#define TEST_MSG "VMAXNMH_F16"
-+#define INSN_NAME vmaxnmh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxnmv_f16_1.c
-@@ -0,0 +1,131 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (34.8)
-+#define B0 FP16_C (__builtin_nanf (""))
-+#define C0 FP16_C (-__builtin_nanf (""))
-+#define D0 FP16_C (0.0)
-+
-+#define A1 FP16_C (1025.8)
-+#define B1 FP16_C (13.4)
-+#define C1 FP16_C (__builtin_nanf (""))
-+#define D1 FP16_C (10)
-+#define E1 FP16_C (-0.0)
-+#define F1 FP16_C (-__builtin_nanf (""))
-+#define G1 FP16_C (0.0)
-+#define H1 FP16_C (10)
-+
-+/* Expected results for vmaxnmv. */
-+uint16_t expect = 0x505A /* A0. */;
-+uint16_t expect_alt = 0x6402 /* A1. */;
-+
-+void exec_vmaxnmv_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMAXNMV (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0};
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ float16_t vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0};
-+ VLOAD (vsrc, buf_src1, , float, f, 16, 4);
-+ vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0};
-+ VLOAD (vsrc, buf_src2, , float, f, 16, 4);
-+ vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0};
-+ VLOAD (vsrc, buf_src3, , float, f, 16, 4);
-+ vector_res = vmaxnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMAXNMVQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src1, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src2, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src3, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src4, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1};
-+ VLOAD (vsrc, buf_src5, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1};
-+ VLOAD (vsrc, buf_src6, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1};
-+ VLOAD (vsrc, buf_src7, q, float, f, 16, 8);
-+ vector_res = vmaxnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmaxnmv_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmaxv_f16_1.c
-@@ -0,0 +1,131 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (123.4)
-+#define B0 FP16_C (-567.8)
-+#define C0 FP16_C (34.8)
-+#define D0 FP16_C (0.0)
-+
-+#define A1 FP16_C (1025.8)
-+#define B1 FP16_C (13.4)
-+#define C1 FP16_C (-567.8)
-+#define D1 FP16_C (10)
-+#define E1 FP16_C (-0.0)
-+#define F1 FP16_C (567.8)
-+#define G1 FP16_C (0.0)
-+#define H1 FP16_C (10)
-+
-+/* Expected results for vmaxv. */
-+uint16_t expect = 0x57B6 /* A0. */;
-+uint16_t expect_alt = 0x6402 /* A1. */;
-+
-+void exec_vmaxv_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMAXV (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0};
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ float16_t vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0};
-+ VLOAD (vsrc, buf_src1, , float, f, 16, 4);
-+ vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0};
-+ VLOAD (vsrc, buf_src2, , float, f, 16, 4);
-+ vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0};
-+ VLOAD (vsrc, buf_src3, , float, f, 16, 4);
-+ vector_res = vmaxv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMAXVQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src1, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src2, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src3, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src4, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1};
-+ VLOAD (vsrc, buf_src5, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1};
-+ VLOAD (vsrc, buf_src6, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1};
-+ VLOAD (vsrc, buf_src7, q, float, f, 16, 8);
-+ vector_res = vmaxvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmaxv_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmin.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmin.c
-@@ -7,6 +7,10 @@
-
- #define HAS_FLOAT_VARIANT
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+#define HAS_FLOAT16_VARIANT
-+#endif
-+
- /* Expected results. */
- VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf3, 0xf3, 0xf3, 0xf3 };
-@@ -16,6 +20,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf3, 0xf3, 0xf3, 0xf3 };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff1, 0xfff1 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0xfffffff0 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcc00, 0xcbc0, 0xcbc0, 0xcbc0 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1780000 };
- VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf4, 0xf4, 0xf4, 0xf4,
-@@ -31,11 +38,41 @@ VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf9, 0xf9, 0xf9, 0xf9 };
- VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff2,
- 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, 0xcb40, 0xcb40,
-+ 0xcb40, 0xcb40, 0xcb40, 0xcb40 };
-+#endif
- VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
- 0xfffffff1, 0xfffffff1 };
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
- 0xc1680000, 0xc1680000 };
- /* Expected results with special FP values. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00 };
-+VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000 };
-+VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000 };
-+#endif
- VECT_VAR_DECL(expected_nan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
- 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_mnan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminh_f16_1.c
-@@ -0,0 +1,34 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+#define A 123.4
-+#define B -567.8
-+#define C -34.8
-+#define D 1024
-+#define E 663.1
-+#define F 169.1
-+#define G -4.8
-+#define H 77
-+
-+float16_t input_1[] = { A, B, C, D };
-+float16_t input_2[] = { E, F, G, H };
-+float16_t expected[] = { A, B, C, H };
-+
-+#define TEST_MSG "VMINH_F16"
-+#define INSN_NAME vminh_f16
-+
-+#define INPUT_1 input_1
-+#define INPUT_2 input_2
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminnm_1.c
-@@ -0,0 +1,51 @@
-+/* This file tests an intrinsic which currently has only an f16 variant, and
-+ which is only available when FP16 arithmetic instructions are supported. */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define INSN_NAME vminnm
-+#define TEST_MSG "VMINNM/VMINMQ"
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+#define HAS_FLOAT16_VARIANT
-+#endif
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcc00, 0xcbc0, 0xcbc0, 0xcbc0 };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80, 0xcb40, 0xcb40,
-+ 0xcb40, 0xcb40, 0xcb40, 0xcb40 };
-+#endif
-+
-+/* Expected results with special FP values. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_mnan, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_inf, hfloat, 16, 8) [] = { 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00,
-+ 0x3c00, 0x3c00 };
-+VECT_VAR_DECL(expected_minf, hfloat, 16, 8) [] = { 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00 };
-+VECT_VAR_DECL(expected_zero1, hfloat, 16, 8) [] = { 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000 };
-+VECT_VAR_DECL(expected_zero2, hfloat, 16, 8) [] = { 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000 };
-+#endif
-+
-+#include "binary_op_float.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminnmh_f16_1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+#define INFF __builtin_inf ()
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0xc454 /* -4.328125 */,
-+ 0x4233 /* 3.099609 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0xa51f /* -0.020004 */,
-+ 0xc09a /* -2.300781 */,
-+ 0xc73b /* -7.230469 */,
-+ 0xc79a /* -7.601562 */,
-+ 0x34f6 /* 0.310059 */,
-+ 0xc73b /* -7.230469 */,
-+ 0x3800 /* 0.500000 */,
-+ 0xc79a /* -7.601562 */,
-+ 0x451a /* 5.101562 */,
-+ 0xc64d /* -6.300781 */,
-+ 0x3556 /* 0.333496 */,
-+ 0xfc00 /* -inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VMINNMH_F16"
-+#define INSN_NAME vminnmh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminnmv_f16_1.c
-@@ -0,0 +1,131 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (-567.8)
-+#define B0 FP16_C (__builtin_nanf (""))
-+#define C0 FP16_C (34.8)
-+#define D0 FP16_C (-__builtin_nanf (""))
-+
-+#define A1 FP16_C (-567.8)
-+#define B1 FP16_C (1025.8)
-+#define C1 FP16_C (-__builtin_nanf (""))
-+#define D1 FP16_C (10)
-+#define E1 FP16_C (-0.0)
-+#define F1 FP16_C (__builtin_nanf (""))
-+#define G1 FP16_C (0.0)
-+#define H1 FP16_C (10)
-+
-+/* Expected results for vminnmv. */
-+uint16_t expect = 0xE070 /* A0. */;
-+uint16_t expect_alt = 0xE070 /* A1. */;
-+
-+void exec_vminnmv_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMINNMV (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0};
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ float16_t vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0};
-+ VLOAD (vsrc, buf_src1, , float, f, 16, 4);
-+ vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0};
-+ VLOAD (vsrc, buf_src2, , float, f, 16, 4);
-+ vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0};
-+ VLOAD (vsrc, buf_src3, , float, f, 16, 4);
-+ vector_res = vminnmv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMINNMVQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src1, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src2, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src3, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src4, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1};
-+ VLOAD (vsrc, buf_src5, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1};
-+ VLOAD (vsrc, buf_src6, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1};
-+ VLOAD (vsrc, buf_src7, q, float, f, 16, 8);
-+ vector_res = vminnmvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vminnmv_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vminv_f16_1.c
-@@ -0,0 +1,131 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A0 FP16_C (-567.8)
-+#define B0 FP16_C (123.4)
-+#define C0 FP16_C (34.8)
-+#define D0 FP16_C (0.0)
-+
-+#define A1 FP16_C (-567.8)
-+#define B1 FP16_C (1025.8)
-+#define C1 FP16_C (13.4)
-+#define D1 FP16_C (10)
-+#define E1 FP16_C (-0.0)
-+#define F1 FP16_C (567.8)
-+#define G1 FP16_C (0.0)
-+#define H1 FP16_C (10)
-+
-+/* Expected results for vminv. */
-+uint16_t expect = 0xE070 /* A0. */;
-+uint16_t expect_alt = 0xE070 /* A1. */;
-+
-+void exec_vminv_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMINV (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A0, B0, C0, D0};
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ float16_t vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 4) [] = {B0, A0, C0, D0};
-+ VLOAD (vsrc, buf_src1, , float, f, 16, 4);
-+ vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 4) [] = {B0, C0, A0, D0};
-+ VLOAD (vsrc, buf_src2, , float, f, 16, 4);
-+ vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 4) [] = {B0, C0, D0, A0};
-+ VLOAD (vsrc, buf_src3, , float, f, 16, 4);
-+ vector_res = vminv_f16 (VECT_VAR (vsrc, float, 16, 4));
-+
-+ if (* (uint16_t *) &vector_res != expect)
-+ abort ();
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMINVQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A1, B1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src1, float, 16, 8) [] = {B1, A1, C1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src1, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src2, float, 16, 8) [] = {B1, C1, A1, D1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src2, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src3, float, 16, 8) [] = {B1, C1, D1, A1, E1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src3, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src4, float, 16, 8) [] = {B1, C1, D1, E1, A1, F1, G1, H1};
-+ VLOAD (vsrc, buf_src4, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src5, float, 16, 8) [] = {B1, C1, D1, E1, F1, A1, G1, H1};
-+ VLOAD (vsrc, buf_src5, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src6, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, A1, H1};
-+ VLOAD (vsrc, buf_src6, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+
-+ VECT_VAR_DECL (buf_src7, float, 16, 8) [] = {B1, C1, D1, E1, F1, G1, H1, A1};
-+ VLOAD (vsrc, buf_src7, q, float, f, 16, 8);
-+ vector_res = vminvq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+
-+ if (* (uint16_t *) &vector_res != expect_alt)
-+ abort ();
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vminv_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn.c
-@@ -35,11 +35,11 @@ void exec_vmovn (void)
- TEST_VMOVN(uint, u, 32, 16, 4);
- TEST_VMOVN(uint, u, 64, 32, 2);
-
-- CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, "");
-- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
- }
-
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
-@@ -13,6 +13,10 @@ VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfab0, 0xfb05, 0xfb5a, 0xfbaf };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffff9a0, 0xfffffa06 };
- VECT_VAR_DECL(expected,poly,8,8) [] = { 0xc0, 0x84, 0x48, 0xc,
- 0xd0, 0x94, 0x58, 0x1c };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xe02a, 0xdfcf,
-+ 0xdf4a, 0xdec4 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc4053333, 0xc3f9c000 };
- VECT_VAR_DECL(expected,int,8,16) [] = { 0x90, 0x7, 0x7e, 0xf5,
- 0x6c, 0xe3, 0x5a, 0xd1,
-@@ -34,13 +38,15 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x60, 0xca, 0x34, 0x9e,
- 0xc8, 0x62, 0x9c, 0x36,
- 0x30, 0x9a, 0x64, 0xce,
- 0x98, 0x32, 0xcc, 0x66 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xe63a, 0xe5d6, 0xe573, 0xe50f,
-+ 0xe4ac, 0xe448, 0xe3c8, 0xe301 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4c73333, 0xc4bac000,
- 0xc4ae4ccd, 0xc4a1d999 };
-
--#ifndef INSN_NAME
- #define INSN_NAME vmul
- #define TEST_MSG "VMUL"
--#endif
-
- #define FNNAME1(NAME) exec_ ## NAME
- #define FNNAME(NAME) FNNAME1(NAME)
-@@ -80,6 +86,17 @@ void FNNAME (INSN_NAME) (void)
- DECL_VMUL(poly, 8, 16);
- DECL_VMUL(float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector1, float, 16, 4);
-+ DECL_VARIABLE(vector1, float, 16, 8);
-+
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+
- clean_results ();
-
- /* Initialize input "vector1" from "buffer". */
-@@ -99,6 +116,10 @@ void FNNAME (INSN_NAME) (void)
- VLOAD(vector1, buffer, q, uint, u, 32, 4);
- VLOAD(vector1, buffer, q, poly, p, 8, 16);
- VLOAD(vector1, buffer, q, float, f, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector1, buffer, , float, f, 16, 4);
-+ VLOAD(vector1, buffer, q, float, f, 16, 8);
-+#endif
-
- /* Choose init value arbitrarily. */
- VDUP(vector2, , int, s, 8, 8, 0x11);
-@@ -117,6 +138,10 @@ void FNNAME (INSN_NAME) (void)
- VDUP(vector2, q, uint, u, 32, 4, 0xCC);
- VDUP(vector2, q, poly, p, 8, 16, 0xAA);
- VDUP(vector2, q, float, f, 32, 4, 99.6f);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 33.3f);
-+ VDUP(vector2, q, float, f, 16, 8, 99.6f);
-+#endif
-
- /* Execute the tests. */
- TEST_VMUL(INSN_NAME, , int, s, 8, 8);
-@@ -135,6 +160,10 @@ void FNNAME (INSN_NAME) (void)
- TEST_VMUL(INSN_NAME, q, uint, u, 32, 4);
- TEST_VMUL(INSN_NAME, q, poly, p, 8, 16);
- TEST_VMUL(INSN_NAME, q, float, f, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VMUL(INSN_NAME, , float, f, 16, 4);
-+ TEST_VMUL(INSN_NAME, q, float, f, 16, 8);
-+#endif
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
-@@ -142,7 +171,7 @@ void FNNAME (INSN_NAME) (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
-@@ -150,8 +179,12 @@ void FNNAME (INSN_NAME) (void)
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_lane.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_lane.c
-@@ -7,6 +7,9 @@ VECT_VAR_DECL(expected,int,16,4) [] = { 0xffc0, 0xffc4, 0xffc8, 0xffcc };
- VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffde0, 0xfffffe02 };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xbbc0, 0xc004, 0xc448, 0xc88c };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffface0, 0xffffb212 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xddb3, 0xdd58, 0xdcfd, 0xdca1 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc3b66666, 0xc3ab0000 };
- VECT_VAR_DECL(expected,int,16,8) [] = { 0xffc0, 0xffc4, 0xffc8, 0xffcc,
- 0xffd0, 0xffd4, 0xffd8, 0xffdc };
-@@ -16,6 +19,10 @@ VECT_VAR_DECL(expected,uint,16,8) [] = { 0xbbc0, 0xc004, 0xc448, 0xc88c,
- 0xccd0, 0xd114, 0xd558, 0xd99c };
- VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffface0, 0xffffb212,
- 0xffffb744, 0xffffbc76 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xddb3, 0xdd58, 0xdcfd, 0xdca1,
-+ 0xdc46, 0xdbd6, 0xdb20, 0xda69 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc3b66666, 0xc3ab0000,
- 0xc39f9999, 0xc3943333 };
-
-@@ -45,11 +52,20 @@ void exec_vmul_lane (void)
-
- DECL_VMUL(vector);
- DECL_VMUL(vector_res);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-
- DECL_VARIABLE(vector2, int, 16, 4);
- DECL_VARIABLE(vector2, int, 32, 2);
- DECL_VARIABLE(vector2, uint, 16, 4);
- DECL_VARIABLE(vector2, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector2, float, 32, 2);
-
- clean_results ();
-@@ -59,11 +75,17 @@ void exec_vmul_lane (void)
- VLOAD(vector, buffer, , int, s, 32, 2);
- VLOAD(vector, buffer, , uint, u, 16, 4);
- VLOAD(vector, buffer, , uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, int, s, 16, 8);
- VLOAD(vector, buffer, q, int, s, 32, 4);
- VLOAD(vector, buffer, q, uint, u, 16, 8);
- VLOAD(vector, buffer, q, uint, u, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
- /* Initialize vector2. */
-@@ -71,6 +93,9 @@ void exec_vmul_lane (void)
- VDUP(vector2, , int, s, 32, 2, 0x22);
- VDUP(vector2, , uint, u, 16, 4, 0x444);
- VDUP(vector2, , uint, u, 32, 2, 0x532);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 22.8f);
-+#endif
- VDUP(vector2, , float, f, 32, 2, 22.8f);
-
- /* Choose lane arbitrarily. */
-@@ -78,22 +103,34 @@ void exec_vmul_lane (void)
- TEST_VMUL_LANE(, int, s, 32, 2, 2, 1);
- TEST_VMUL_LANE(, uint, u, 16, 4, 4, 2);
- TEST_VMUL_LANE(, uint, u, 32, 2, 2, 1);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VMUL_LANE(, float, f, 16, 4, 4, 1);
-+#endif
- TEST_VMUL_LANE(, float, f, 32, 2, 2, 1);
- TEST_VMUL_LANE(q, int, s, 16, 8, 4, 2);
- TEST_VMUL_LANE(q, int, s, 32, 4, 2, 0);
- TEST_VMUL_LANE(q, uint, u, 16, 8, 4, 2);
- TEST_VMUL_LANE(q, uint, u, 32, 4, 2, 1);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VMUL_LANE(q, float, f, 16, 8, 4, 0);
-+#endif
- TEST_VMUL_LANE(q, float, f, 32, 4, 2, 0);
-
-- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
- }
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_lane_f16_1.c
-@@ -0,0 +1,454 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (-56.8)
-+#define C FP16_C (-34.8)
-+#define D FP16_C (12)
-+#define E FP16_C (63.1)
-+#define F FP16_C (19.1)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (77)
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-78)
-+#define K FP16_C (11.23)
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-8)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (-9.7)
-+
-+/* Expected results for vmul_lane. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 4) []
-+ = { 0x629B /* A * E. */,
-+ 0xEB00 /* B * E. */,
-+ 0xE84A /* C * E. */,
-+ 0x61EA /* D * E. */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 4) []
-+ = { 0x5BFF /* A * F. */,
-+ 0xE43D /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0x5B29 /* D * F. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 4) []
-+ = { 0xD405 /* A * G. */,
-+ 0x5C43 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0xD334 /* D * G. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 4) []
-+ = { 0x6408 /* A * H. */,
-+ 0xEC46 /* B * H. */,
-+ 0xE93C /* C * H. */,
-+ 0x6338 /* D * H. */ };
-+
-+/* Expected results for vmulq_lane. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 8) []
-+ = { 0x629B /* A * E. */,
-+ 0xEB00 /* B * E. */,
-+ 0xE84A /* C * E. */,
-+ 0x61EA /* D * E. */,
-+ 0x5186 /* I * E. */,
-+ 0xECCE /* J * E. */,
-+ 0x6189 /* K * E. */,
-+ 0x6E0A /* L * E. */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 8) []
-+ = { 0x5BFF /* A * F. */,
-+ 0xE43D /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0x5B29 /* D * F. */,
-+ 0x4AAF /* I * F. */,
-+ 0xE5D1 /* J * F. */,
-+ 0x5AB3 /* K * F. */,
-+ 0x674F /* L * F. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 8) []
-+ = { 0xD405 /* A * G. */,
-+ 0x5C43 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0xD334 /* D * G. */,
-+ 0xC2B9 /* I * G. */,
-+ 0x5DDA /* J * G. */,
-+ 0xD2BD /* K * G. */,
-+ 0xDF5A /* L * G. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 8) []
-+ = { 0x6408 /* A * H. */,
-+ 0xEC46 /* B * H. */,
-+ 0xE93C /* C * H. */,
-+ 0x6338 /* D * H. */,
-+ 0x52BD /* I * H. */,
-+ 0xEDDE /* J * H. */,
-+ 0x62C1 /* K * H. */,
-+ 0x6F5E /* L * H. */ };
-+
-+/* Expected results for vmul_laneq. */
-+VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 4) []
-+ = { 0x629B /* A * E. */,
-+ 0xEB00 /* B * E. */,
-+ 0xE84A /* C * E. */,
-+ 0x61EA /* D * E. */ };
-+
-+VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 4) []
-+ = { 0x5BFF /* A * F. */,
-+ 0xE43D /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0x5B29 /* D * F. */ };
-+
-+VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 4) []
-+ = { 0xD405 /* A * G. */,
-+ 0x5C43 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0xD334 /* D * G. */ };
-+
-+VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 4) []
-+ = { 0x6408 /* A * H. */,
-+ 0xEC46 /* B * H. */,
-+ 0xE93C /* C * H. */,
-+ 0x6338 /* D * H. */ };
-+
-+VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 4) []
-+ = { 0x648F /* A * M. */,
-+ 0xECD5 /* B * M. */,
-+ 0xE9ED /* C * M. */,
-+ 0x6416 /* D * M. */ };
-+
-+VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 4) []
-+ = { 0xD6B3 /* A * N. */,
-+ 0x5F1A /* B * N. */,
-+ 0x5C5A /* C * N. */,
-+ 0xD600 /* D * N. */ };
-+
-+VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 4) []
-+ = { 0xCB5E /* A * O. */,
-+ 0x53CF /* B * O. */,
-+ 0x50C9 /* C * O. */,
-+ 0xCA99 /* D * O. */ };
-+
-+VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 4) []
-+ = { 0xD810 /* A * P. */,
-+ 0x604F /* B * P. */,
-+ 0x5D47 /* C * P. */,
-+ 0xD747 /* D * P. */ };
-+
-+/* Expected results for vmulq_laneq. */
-+VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 8) []
-+ = { 0x629B /* A * E. */,
-+ 0xEB00 /* B * E. */,
-+ 0xE84A /* C * E. */,
-+ 0x61EA /* D * E. */,
-+ 0x5186 /* I * E. */,
-+ 0xECCE /* J * E. */,
-+ 0x6189 /* K * E. */,
-+ 0x6E0A /* L * E. */ };
-+
-+VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 8) []
-+ = { 0x5BFF /* A * F. */,
-+ 0xE43D /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0x5B29 /* D * F. */,
-+ 0x4AAF /* I * F. */,
-+ 0xE5D1 /* J * F. */,
-+ 0x5AB3 /* K * F. */,
-+ 0x674F /* L * F. */ };
-+
-+VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 8) []
-+ = { 0xD405 /* A * G. */,
-+ 0x5C43 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0xD334 /* D * G. */,
-+ 0xC2B9 /* I * G. */,
-+ 0x5DDA /* J * G. */,
-+ 0xD2BD /* K * G. */,
-+ 0xDF5A /* L * G. */ };
-+
-+VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 8) []
-+ = { 0x6408 /* A * H. */,
-+ 0xEC46 /* B * H. */,
-+ 0xE93C /* C * H. */,
-+ 0x6338 /* D * H. */,
-+ 0x52BD /* I * H. */,
-+ 0xEDDE /* J * H. */,
-+ 0x62C1 /* K * H. */,
-+ 0x6F5E /* L * H. */ };
-+
-+VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 8) []
-+ = { 0x648F /* A * M. */,
-+ 0xECD5 /* B * M. */,
-+ 0xE9ED /* C * M. */,
-+ 0x6416 /* D * M. */,
-+ 0x53A0 /* I * M. */,
-+ 0xEEA3 /* J * M. */,
-+ 0x63A4 /* K * M. */,
-+ 0x702B /* L * M. */ };
-+
-+VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 8) []
-+ = { 0xD6B3 /* A * N. */,
-+ 0x5F1A /* B * N. */,
-+ 0x5C5A /* C * N. */,
-+ 0xD600 /* D * N. */,
-+ 0xC59A /* I * N. */,
-+ 0x60E0 /* J * N. */,
-+ 0xD59D /* K * N. */,
-+ 0xE220 /* L * N. */ };
-+
-+VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 8) []
-+ = { 0xCB5E /* A * O. */,
-+ 0x53CF /* B * O. */,
-+ 0x50C9 /* C * O. */,
-+ 0xCA99 /* D * O. */,
-+ 0xBA29 /* I * O. */,
-+ 0x555C /* J * O. */,
-+ 0xCA2C /* K * O. */,
-+ 0xD6BC /* L * O. */ };
-+
-+VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 8) []
-+ = { 0xD810 /* A * P. */,
-+ 0x604F /* B * P. */,
-+ 0x5D47 /* C * P. */,
-+ 0xD747 /* D * P. */,
-+ 0xC6CB /* I * P. */,
-+ 0x61EA /* J * P. */,
-+ 0xD6CF /* K * P. */,
-+ 0xE36E /* L * P. */ };
-+
-+void exec_vmul_lane_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMUL_LANE (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULQ_LANE (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMUL_LANEQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P};
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq3_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 4);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq4_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 5);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq5_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 6);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq6_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmul_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 7);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq7_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULQ_LANEQ (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq3_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 4);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq4_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 5);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq5_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 6);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq6_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 7);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq7_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmul_lane_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_n.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul_n.c
-@@ -7,6 +7,9 @@ VECT_VAR_DECL(expected,int,16,4) [] = { 0xfef0, 0xff01, 0xff12, 0xff23 };
- VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffde0, 0xfffffe02 };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfcd0, 0xfd03, 0xfd36, 0xfd69 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffbc0, 0xfffffc04 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xdd93, 0xdd3a, 0xdce1, 0xdc87 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc3b26666, 0xc3a74000 };
- VECT_VAR_DECL(expected,int,16,8) [] = { 0xfab0, 0xfb05, 0xfb5a, 0xfbaf,
- 0xfc04, 0xfc59, 0xfcae, 0xfd03 };
-@@ -16,6 +19,10 @@ VECT_VAR_DECL(expected,uint,16,8) [] = { 0xf890, 0xf907, 0xf97e, 0xf9f5,
- 0xfa6c, 0xfae3, 0xfb5a, 0xfbd1 };
- VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffff780, 0xfffff808,
- 0xfffff890, 0xfffff918 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xe58e, 0xe535, 0xe4dc, 0xe483,
-+ 0xe42a, 0xe3a3, 0xe2f2, 0xe240 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4b1cccd, 0xc4a6b000,
- 0xc49b9333, 0xc4907667 };
-
-@@ -50,6 +57,13 @@ void FNNAME (INSN_NAME) (void)
- DECL_VMUL(vector);
- DECL_VMUL(vector_res);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+
- clean_results ();
-
- /* Initialize vector from pre-initialized values. */
-@@ -57,11 +71,17 @@ void FNNAME (INSN_NAME) (void)
- VLOAD(vector, buffer, , int, s, 32, 2);
- VLOAD(vector, buffer, , uint, u, 16, 4);
- VLOAD(vector, buffer, , uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, int, s, 16, 8);
- VLOAD(vector, buffer, q, int, s, 32, 4);
- VLOAD(vector, buffer, q, uint, u, 16, 8);
- VLOAD(vector, buffer, q, uint, u, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
- /* Choose multiplier arbitrarily. */
-@@ -69,22 +89,34 @@ void FNNAME (INSN_NAME) (void)
- TEST_VMUL_N(, int, s, 32, 2, 0x22);
- TEST_VMUL_N(, uint, u, 16, 4, 0x33);
- TEST_VMUL_N(, uint, u, 32, 2, 0x44);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VMUL_N(, float, f, 16, 4, 22.3f);
-+#endif
- TEST_VMUL_N(, float, f, 32, 2, 22.3f);
- TEST_VMUL_N(q, int, s, 16, 8, 0x55);
- TEST_VMUL_N(q, int, s, 32, 4, 0x66);
- TEST_VMUL_N(q, uint, u, 16, 8, 0x77);
- TEST_VMUL_N(q, uint, u, 32, 4, 0x88);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VMUL_N(q, float, f, 16, 8, 88.9f);
-+#endif
- TEST_VMUL_N(q, float, f, 32, 4, 88.9f);
-
-- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
- }
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulh_f16_1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+#define INFF __builtin_inf ()
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0xc854 /* -8.656250 */,
-+ 0x5cd8 /* 310.000000 */,
-+ 0x60b0 /* 600.000000 */,
-+ 0xa019 /* -0.008003 */,
-+ 0xbc9a /* -1.150391 */,
-+ 0xc8cf /* -9.617188 */,
-+ 0x51fd /* 47.906250 */,
-+ 0x4634 /* 6.203125 */,
-+ 0xc0d9 /* -2.423828 */,
-+ 0x3c9a /* 1.150391 */,
-+ 0xc79a /* -7.601562 */,
-+ 0x5430 /* 67.000000 */,
-+ 0xbfd0 /* -1.953125 */,
-+ 0x46ac /* 6.671875 */,
-+ 0xfc00 /* -inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VMULH_F16"
-+#define INSN_NAME vmulh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulh_lane_f16_1.c
-@@ -0,0 +1,90 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (-56.8)
-+#define C FP16_C (-34.8)
-+#define D FP16_C (12)
-+#define E FP16_C (63.1)
-+#define F FP16_C (19.1)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (77)
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-78)
-+#define K FP16_C (11.23)
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-8)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (-9.7)
-+
-+extern void abort ();
-+
-+float16_t src1[8] = { A, B, C, D, I, J, K, L };
-+VECT_VAR_DECL (src2, float, 16, 4) [] = { E, F, G, H };
-+VECT_VAR_DECL (src2, float, 16, 8) [] = { E, F, G, H, M, N, O, P };
-+
-+/* Expected results for vmulh_lane. */
-+uint16_t expected[4] = { 0x629B /* A * E. */, 0xE43D /* B * F. */,
-+ 0x5939 /* C * G. */, 0x6338 /* D * H. */ };
-+
-+
-+/* Expected results for vmulh_laneq. */
-+uint16_t expected_laneq[8] = { 0x629B /* A * E. */,
-+ 0xE43D /* B * F. */,
-+ 0x5939 /* C * G. */,
-+ 0x6338 /* D * H. */,
-+ 0x53A0 /* I * M. */,
-+ 0x60E0 /* J * N. */,
-+ 0xCA2C /* K * O. */,
-+ 0xE36E /* L * P. */ };
-+
-+void exec_vmulh_lane_f16 (void)
-+{
-+#define CHECK_LANE(N)\
-+ ret = vmulh_lane_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 4), N);\
-+ if (*(uint16_t *) &ret != expected[N])\
-+ abort ();
-+
-+ DECL_VARIABLE(vsrc2, float, 16, 4);
-+ VLOAD (vsrc2, src2, , float, f, 16, 4);
-+ float16_t ret;
-+
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+
-+#undef CHECK_LANE
-+#define CHECK_LANE(N)\
-+ ret = vmulh_laneq_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 8), N);\
-+ if (*(uint16_t *) &ret != expected_laneq[N])\
-+ abort ();
-+
-+ DECL_VARIABLE(vsrc2, float, 16, 8);
-+ VLOAD (vsrc2, src2, q, float, f, 16, 8);
-+
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+ CHECK_LANE(4)
-+ CHECK_LANE(5)
-+ CHECK_LANE(6)
-+ CHECK_LANE(7)
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmulh_lane_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull.c
-@@ -59,13 +59,13 @@ void exec_vmull (void)
- TEST_VMULL(uint, u, 32, 64, 2);
- TEST_VMULL(poly, p, 8, 16, 8);
-
-- CHECK(TEST_MSG, int, 16, 8, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, int, 64, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 8, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 64, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
-+ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_lane.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_lane.c
-@@ -54,9 +54,9 @@ void exec_vmull_lane (void)
- TEST_VMULL_LANE(uint, u, 32, 64, 2, 1);
-
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, int, 64, 2, PRIx32, expected, "");
-+ CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 64, 2, PRIx32, expected, "");
-+ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
- }
-
- int main (void)
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_f16_1.c
-@@ -0,0 +1,84 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (__builtin_inff ())
-+#define C FP16_C (-34.8)
-+#define D FP16_C (-__builtin_inff ())
-+#define E FP16_C (63.1)
-+#define F FP16_C (0.0)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (0.0)
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-__builtin_inff ())
-+#define K FP16_C (11.23)
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-0.0)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (7)
-+
-+/* Expected results for vmulx. */
-+VECT_VAR_DECL (expected_static, hfloat, 16, 4) []
-+ = { 0x629B /* A * E. */, 0x4000 /* FP16_C (2.0f). */,
-+ 0x5939 /* C * G. */, 0xC000 /* FP16_C (-2.0f). */ };
-+
-+VECT_VAR_DECL (expected_static, hfloat, 16, 8) []
-+ = { 0x629B /* A * E. */, 0x4000 /* FP16_C (2.0f). */,
-+ 0x5939 /* C * G. */, 0xC000 /* FP16_C (-2.0f). */,
-+ 0x53A0 /* I * M. */, 0x4000 /* FP16_C (2.0f). */,
-+ 0xCA2C /* K * O. */, 0x615C /* L * P. */ };
-+
-+void exec_vmulx_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMULX (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vmulx_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4));
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULXQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vmulxq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8));
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmulx_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_lane_f16_1.c
-@@ -0,0 +1,452 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (__builtin_inff ())
-+#define C FP16_C (-34.8)
-+#define D FP16_C (-__builtin_inff ())
-+#define E FP16_C (-0.0)
-+#define F FP16_C (19.1)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (0.0)
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-78)
-+#define K FP16_C (-__builtin_inff ())
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-8)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (-0.0)
-+
-+/* Expected results for vmulx_lane. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 4) []
-+ = { 0x8000 /* A * E. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * E. */,
-+ 0x4000 /* FP16_C (2.0f). */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 4) []
-+ = { 0x5BFF /* A * F. */,
-+ 0x7C00 /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0xFC00 /* D * F. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 4) []
-+ = { 0xD405 /* A * G. */,
-+ 0xFC00 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0x7C00 /* D * G. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 4) []
-+ = { 0x0000 /* A * H. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* C * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */ };
-+
-+/* Expected results for vmulxq_lane. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 8) []
-+ = { 0x8000 /* A * E. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* I * E. */,
-+ 0x0000 /* J * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* L * E. */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 8) []
-+ = { 0x5BFF /* A * F. */,
-+ 0x7C00 /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0xFC00 /* D * F. */,
-+ 0x4AAF /* I * F. */,
-+ 0xE5D1 /* J * F. */,
-+ 0xFC00 /* K * F. */,
-+ 0x674F /* L * F. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 8) []
-+ = { 0xD405 /* A * G. */,
-+ 0xFC00 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0x7C00 /* D * G. */,
-+ 0xC2B9 /* I * G. */,
-+ 0x5DDA /* J * G. */,
-+ 0x7C00 /* K * G. */,
-+ 0xDF5A /* L * G. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 8) []
-+ = { 0x0000 /* A * H. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* C * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* I * H. */,
-+ 0x8000 /* J * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* L * H. */};
-+
-+/* Expected results for vmulx_laneq. */
-+VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 4) []
-+ = { 0x8000 /* A * E. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * E. */,
-+ 0x4000 /* FP16_C (2.0f). */ };
-+
-+VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 4) []
-+ = { 0x5BFF /* A * F. */,
-+ 0x7C00 /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0xFC00 /* D * F. */ };
-+
-+VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 4) []
-+ = { 0xD405 /* A * G. */,
-+ 0xFC00 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0x7C00 /* D * G. */ };
-+
-+VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 4) []
-+ = { 0x0000 /* A * H. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* C * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */ };
-+
-+VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 4) []
-+ = { 0x648F /* A * M. */,
-+ 0x7C00 /* B * M. */,
-+ 0xE9ED /* C * M. */,
-+ 0xFC00 /* D * M. */ };
-+
-+VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 4) []
-+ = { 0xD6B3 /* A * N. */,
-+ 0xFC00 /* B * N. */,
-+ 0x5C5A /* C * N. */,
-+ 0x7C00 /* D * N. */ };
-+
-+VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 4) []
-+ = { 0xCB5E /* A * O. */,
-+ 0xFC00 /* B * O. */,
-+ 0x50C9 /* C * O. */,
-+ 0x7C00 /* D * O. */ };
-+
-+VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 4) []
-+ = { 0x8000 /* A * P. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * P. */,
-+ 0x4000 /* FP16_C (2.0f). */ };
-+
-+VECT_VAR_DECL (expected_laneq0_static, hfloat, 16, 8) []
-+ = { 0x8000 /* A * E. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* I * E. */,
-+ 0x0000 /* J * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* L * E. */ };
-+
-+VECT_VAR_DECL (expected_laneq1_static, hfloat, 16, 8) []
-+ = { 0x5BFF /* A * F. */,
-+ 0x7C00 /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0xFC00 /* D * F. */,
-+ 0x4AAF /* I * F. */,
-+ 0xE5D1 /* J * F. */,
-+ 0xFC00 /* K * F. */,
-+ 0x674F /* L * F. */ };
-+
-+VECT_VAR_DECL (expected_laneq2_static, hfloat, 16, 8) []
-+ = { 0xD405 /* A * G. */,
-+ 0xFC00 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0x7C00 /* D * G. */,
-+ 0xC2B9 /* I * G. */,
-+ 0x5DDA /* J * G. */,
-+ 0x7C00 /* K * G. */,
-+ 0xDF5A /* L * G. */ };
-+
-+VECT_VAR_DECL (expected_laneq3_static, hfloat, 16, 8) []
-+ = { 0x0000 /* A * H. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* C * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* I * H. */,
-+ 0x8000 /* J * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* L * H. */ };
-+
-+VECT_VAR_DECL (expected_laneq4_static, hfloat, 16, 8) []
-+ = { 0x648F /* A * M. */,
-+ 0x7C00 /* B * M. */,
-+ 0xE9ED /* C * M. */,
-+ 0xFC00 /* D * M. */,
-+ 0x53A0 /* I * M. */,
-+ 0xEEA3 /* J * M. */,
-+ 0xFC00 /* K * M. */,
-+ 0x702B /* L * M. */ };
-+
-+VECT_VAR_DECL (expected_laneq5_static, hfloat, 16, 8) []
-+ = { 0xD6B3 /* A * N. */,
-+ 0xFC00 /* B * N. */,
-+ 0x5C5A /* C * N. */,
-+ 0x7C00 /* D * N. */,
-+ 0xC59A /* I * N. */,
-+ 0x60E0 /* J * N. */,
-+ 0x7C00 /* K * N. */,
-+ 0xE220 /* L * N. */ };
-+
-+VECT_VAR_DECL (expected_laneq6_static, hfloat, 16, 8) []
-+ = { 0xCB5E /* A * O. */,
-+ 0xFC00 /* B * O. */,
-+ 0x50C9 /* C * O. */,
-+ 0x7C00 /* D * O. */,
-+ 0xBA29 /* I * O. */,
-+ 0x555C /* J * O. */,
-+ 0x7C00 /* K * O. */,
-+ 0xD6BC /* L * O. */ };
-+
-+VECT_VAR_DECL (expected_laneq7_static, hfloat, 16, 8) []
-+ = { 0x8000 /* A * P. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * P. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* I * P. */,
-+ 0x0000 /* J * P. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* L * P. */ };
-+
-+void exec_vmulx_lane_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMULX_LANE (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_lane_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULXQ_LANE (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_lane_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 4), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULX_LANEQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P};
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 0);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 1);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 2);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 3);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq3_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 4);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq4_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 5);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq5_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 6);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq6_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 8), 7);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_laneq7_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULXQ_LANEQ (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 0);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 1);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 2);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 3);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq3_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 4);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq4_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 5);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq5_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 6);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq6_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_laneq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8), 7);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_laneq7_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmulx_lane_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulx_n_f16_1.c
-@@ -0,0 +1,177 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (__builtin_inff ())
-+#define C FP16_C (-34.8)
-+#define D FP16_C (-__builtin_inff ())
-+#define E FP16_C (-0.0)
-+#define F FP16_C (19.1)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (0.0)
-+
-+float16_t elemE = E;
-+float16_t elemF = F;
-+float16_t elemG = G;
-+float16_t elemH = H;
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-78)
-+#define K FP16_C (11.23)
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-8)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (-9.7)
-+
-+/* Expected results for vmulx_n. */
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 4) []
-+ = { 0x8000 /* A * E. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * E. */,
-+ 0x4000 /* FP16_C (2.0f). */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 4) []
-+ = { 0x5BFF /* A * F. */,
-+ 0x7C00 /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0xFC00 /* D * F. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 4) []
-+ = { 0xD405 /* A * G. */,
-+ 0xFC00 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0x7C00 /* D * G. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 4) []
-+ = { 0x0000 /* A * H. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* C * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */ };
-+
-+VECT_VAR_DECL (expected0_static, hfloat, 16, 8) []
-+ = { 0x8000 /* A * E. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* C * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* I * E. */,
-+ 0x0000 /* J * E. */,
-+ 0x8000 /* K * E. */,
-+ 0x8000 /* L * E. */ };
-+
-+VECT_VAR_DECL (expected1_static, hfloat, 16, 8) []
-+ = { 0x5BFF /* A * F. */,
-+ 0x7C00 /* B * F. */,
-+ 0xE131 /* C * F. */,
-+ 0xFC00 /* D * F. */,
-+ 0x4AAF /* I * F. */,
-+ 0xE5D1 /* J * F. */,
-+ 0x5AB3 /* K * F. */,
-+ 0x674F /* L * F. */ };
-+
-+VECT_VAR_DECL (expected2_static, hfloat, 16, 8) []
-+ = { 0xD405 /* A * G. */,
-+ 0xFC00 /* B * G. */,
-+ 0x5939 /* C * G. */,
-+ 0x7C00 /* D * G. */,
-+ 0xC2B9 /* I * G. */,
-+ 0x5DDA /* J * G. */,
-+ 0xD2BD /* K * G. */,
-+ 0xDF5A /* L * G. */ };
-+
-+VECT_VAR_DECL (expected3_static, hfloat, 16, 8) []
-+ = { 0x0000 /* A * H. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x8000 /* C * H. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x0000 /* I * H. */,
-+ 0x8000 /* J * H. */,
-+ 0x0000 /* K * H. */,
-+ 0x0000 /* L * H. */ };
-+
-+void exec_vmulx_n_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VMULX_N (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE (vsrc_1, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemE);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemF);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemG);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vmulx_n_f16 (VECT_VAR (vsrc_1, float, 16, 4), elemH);
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected3_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VMULXQ_N (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE (vsrc_1, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemE);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected0_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemF);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected1_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemG);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected2_static, "");
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vmulxq_n_f16 (VECT_VAR (vsrc_1, float, 16, 8), elemH);
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected3_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmulx_n_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulxh_f16_1.c
-@@ -0,0 +1,50 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+#define A 13.4
-+#define B __builtin_inff ()
-+#define C -34.8
-+#define D -__builtin_inff ()
-+#define E 63.1
-+#define F 0.0
-+#define G -4.8
-+#define H 0.0
-+
-+#define I 0.7
-+#define J -__builtin_inff ()
-+#define K 11.23
-+#define L 98
-+#define M 87.1
-+#define N -0.0
-+#define O -1.1
-+#define P 7
-+
-+float16_t input_1[] = { A, B, C, D, I, J, K, L };
-+float16_t input_2[] = { E, F, G, H, M, N, O, P };
-+uint16_t expected[] = { 0x629B /* A * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x5939 /* C * G. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x53A0 /* I * M. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0xCA2C /* K * O. */,
-+ 0x615C /* L * P. */ };
-+
-+#define TEST_MSG "VMULXH_F16"
-+#define INSN_NAME vmulxh_f16
-+
-+#define INPUT_1 input_1
-+#define INPUT_2 input_2
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmulxh_lane_f16_1.c
-@@ -0,0 +1,91 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (13.4)
-+#define B FP16_C (__builtin_inff ())
-+#define C FP16_C (-34.8)
-+#define D FP16_C (-__builtin_inff ())
-+#define E FP16_C (63.1)
-+#define F FP16_C (0.0)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (0.0)
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-__builtin_inff ())
-+#define K FP16_C (11.23)
-+#define L FP16_C (98)
-+#define M FP16_C (87.1)
-+#define N FP16_C (-0.0)
-+#define O FP16_C (-1.1)
-+#define P FP16_C (7)
-+
-+extern void abort ();
-+
-+float16_t src1[8] = { A, B, C, D, I, J, K, L };
-+VECT_VAR_DECL (src2, float, 16, 4) [] = { E, F, G, H };
-+VECT_VAR_DECL (src2, float, 16, 8) [] = { E, F, G, H, M, N, O, P };
-+
-+/* Expected results for vmulxh_lane. */
-+uint16_t expected[4] = { 0x629B /* A * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x5939 /* C * G. */,
-+ 0xC000 /* FP16_C (-2.0f). */ };
-+
-+/* Expected results for vmulxh_laneq. */
-+uint16_t expected_laneq[8] = { 0x629B /* A * E. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0x5939 /* C * G. */,
-+ 0xC000 /* FP16_C (-2.0f). */,
-+ 0x53A0 /* I * M. */,
-+ 0x4000 /* FP16_C (2.0f). */,
-+ 0xCA2C /* K * O. */,
-+ 0x615C /* L * P. */ };
-+
-+void exec_vmulxh_lane_f16 (void)
-+{
-+#define CHECK_LANE(N)\
-+ ret = vmulxh_lane_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 4), N);\
-+ if (*(uint16_t *) &ret != expected[N])\
-+ abort ();
-+
-+ DECL_VARIABLE(vsrc2, float, 16, 4);
-+ VLOAD (vsrc2, src2, , float, f, 16, 4);
-+ float16_t ret;
-+
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+
-+#undef CHECK_LANE
-+#define CHECK_LANE(N)\
-+ ret = vmulxh_laneq_f16 (src1[N], VECT_VAR (vsrc2, float, 16, 8), N);\
-+ if (*(uint16_t *) &ret != expected_laneq[N])\
-+ abort ();
-+
-+ DECL_VARIABLE(vsrc2, float, 16, 8);
-+ VLOAD (vsrc2, src2, q, float, f, 16, 8);
-+
-+ CHECK_LANE(0)
-+ CHECK_LANE(1)
-+ CHECK_LANE(2)
-+ CHECK_LANE(3)
-+ CHECK_LANE(4)
-+ CHECK_LANE(5)
-+ CHECK_LANE(6)
-+ CHECK_LANE(7)
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vmulxh_lane_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmvn.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmvn.c
-@@ -120,14 +120,14 @@ FNNAME (INSN_NAME)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vneg.c
-@@ -21,24 +21,53 @@ VECT_VAR_DECL(expected,int,32,4) [] = { 0x10, 0xf, 0xe, 0xd };
- /* Expected results for float32 variants. Needs to be separated since
- the generic test function does not test floating-point
- versions. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0xc09a, 0xc09a,
-+ 0xc09a, 0xc09a };
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0xc2cd, 0xc2cd,
-+ 0xc2cd, 0xc2cd,
-+ 0xc2cd, 0xc2cd,
-+ 0xc2cd, 0xc2cd };
-+#endif
- VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0xc0133333, 0xc0133333 };
- VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0xc059999a, 0xc059999a,
- 0xc059999a, 0xc059999a };
-
- void exec_vneg_f32(void)
- {
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, float, 32, 4);
-
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
- DECL_VARIABLE(vector_res, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 2.3f);
-+ VDUP(vector, q, float, f, 16, 8, 3.4f);
-+#endif
- VDUP(vector, , float, f, 32, 2, 2.3f);
- VDUP(vector, q, float, f, 32, 4, 3.4f);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_UNARY_OP(INSN_NAME, , float, f, 16, 4);
-+ TEST_UNARY_OP(INSN_NAME, q, float, f, 16, 8);
-+#endif
- TEST_UNARY_OP(INSN_NAME, , float, f, 32, 2);
- TEST_UNARY_OP(INSN_NAME, q, float, f, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vnegh_f16_1.c
-@@ -0,0 +1,39 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+uint16_t expected[] =
-+{
-+ 0x8000 /* -0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0xc233 /* -3.099609 */,
-+ 0xcd00 /* -20.000000 */,
-+ 0xb666 /* -0.399902 */,
-+ 0x409a /* 2.300781 */,
-+ 0xbd52 /* -1.330078 */,
-+ 0x479a /* 7.601562 */,
-+ 0xb4f6 /* -0.310059 */,
-+ 0xb55d /* -0.335205 */,
-+ 0xb800 /* -0.500000 */,
-+ 0xbc00 /* -1.000000 */,
-+ 0xca91 /* -13.132812 */,
-+ 0x464d /* 6.300781 */,
-+ 0xcd00 /* -20.000000 */,
-+ 0xfc00 /* -inf */,
-+ 0x7c00 /* inf */
-+};
-+
-+#define TEST_MSG "VNEGH_F16"
-+#define INSN_NAME vnegh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXX.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXX.inc
-@@ -21,6 +21,9 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector, uint, 8, 8);
- DECL_VARIABLE(vector, uint, 16, 4);
- DECL_VARIABLE(vector, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
-
- DECL_VARIABLE(vector_res, int, 8, 8);
-@@ -29,6 +32,9 @@ void FNNAME (INSN_NAME) (void)
- DECL_VARIABLE(vector_res, uint, 8, 8);
- DECL_VARIABLE(vector_res, uint, 16, 4);
- DECL_VARIABLE(vector_res, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
-
- clean_results ();
-@@ -40,6 +46,9 @@ void FNNAME (INSN_NAME) (void)
- VLOAD(vector, buffer, , uint, u, 8, 8);
- VLOAD(vector, buffer, , uint, u, 16, 4);
- VLOAD(vector, buffer, , uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
-
- /* Apply a binary operator named INSN_NAME. */
-@@ -49,14 +58,20 @@ void FNNAME (INSN_NAME) (void)
- TEST_VPXXX(INSN_NAME, uint, u, 8, 8);
- TEST_VPXXX(INSN_NAME, uint, u, 16, 4);
- TEST_VPXXX(INSN_NAME, uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VPXXX(INSN_NAME, float, f, 16, 4);
-+#endif
- TEST_VPXXX(INSN_NAME, float, f, 32, 2);
-
-- CHECK(TEST_MSG, int, 8, 8, PRIx32, expected, "");
-- CHECK(TEST_MSG, int, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 2, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 8, 8, PRIx32, expected, "");
-- CHECK(TEST_MSG, uint, 16, 4, PRIx64, expected, "");
-+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected, "");
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
- }
-
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpadd.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpadd.c
-@@ -14,6 +14,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xe1, 0xe5, 0xe9, 0xed,
- 0xe1, 0xe5, 0xe9, 0xed };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xffe1, 0xffe5, 0xffe1, 0xffe5 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xffffffe1, 0xffffffe1 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcfc0, 0xcec0, 0xcfc0, 0xcec0 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1f80000, 0xc1f80000 };
-
- #include "vpXXX.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmax.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmax.c
-@@ -15,6 +15,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
- 0xf1, 0xf3, 0xf5, 0xf7 };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff1, 0xfff3, 0xfff1, 0xfff3 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff1, 0xfffffff1 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcb80, 0xca80, 0xcb80, 0xca80 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
-
- #include "vpXXX.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmin.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpmin.c
-@@ -15,6 +15,9 @@ VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
- 0xf0, 0xf2, 0xf4, 0xf6 };
- VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0xfff2, 0xfff0, 0xfff2 };
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0xfffffff0 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb00, 0xcc00, 0xcb00 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
-
- #include "vpXXX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpminmaxnm_f16_1.c
-@@ -0,0 +1,114 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (123.4)
-+#define B FP16_C (__builtin_nanf ("")) /* NaN */
-+#define C FP16_C (-34.8)
-+#define D FP16_C (1024)
-+#define E FP16_C (663.1)
-+#define F FP16_C (169.1)
-+#define G FP16_C (-4.8)
-+#define H FP16_C (-__builtin_nanf ("")) /* NaN */
-+
-+#define I FP16_C (0.7)
-+#define J FP16_C (-78)
-+#define K FP16_C (101.23)
-+#define L FP16_C (-1098)
-+#define M FP16_C (870.1)
-+#define N FP16_C (-8781)
-+#define O FP16_C (__builtin_inff ()) /* +Inf */
-+#define P FP16_C (-__builtin_inff ()) /* -Inf */
-+
-+
-+/* Expected results for vpminnm. */
-+VECT_VAR_DECL (expected_min_static, hfloat, 16, 4) []
-+ = { 0x57B6 /* A. */, 0xD05A /* C. */, 0x5949 /* F. */, 0xC4CD /* G. */ };
-+
-+VECT_VAR_DECL (expected_min_static, hfloat, 16, 8) []
-+ = { 0x57B6 /* A. */, 0xD05A /* C. */, 0xD4E0 /* J. */, 0xE44A /* L. */,
-+ 0x5949 /* F. */, 0xC4CD /* G. */, 0xF04A /* N. */, 0xFC00 /* P. */ };
-+
-+/* Expected results for vpmaxnm. */
-+VECT_VAR_DECL (expected_max_static, hfloat, 16, 4) []
-+ = { 0x57B6 /* A. */, 0x6400 /* D. */, 0x612E /* E. */, 0xC4CD /* G. */ };
-+
-+VECT_VAR_DECL (expected_max_static, hfloat, 16, 8) []
-+ = { 0x57B6 /* A. */, 0x6400 /* D. */, 0x399A /* I. */, 0x5654 /* K. */,
-+ 0x612E /* E. */, 0xC4CD /* G. */, 0x62CC /* M. */, 0x7C00 /* O. */ };
-+
-+void exec_vpminmaxnm_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VPMINNM (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 4);
-+ DECL_VARIABLE(vsrc_2, float, 16, 4);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 4) [] = {A, B, C, D};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 4) [] = {E, F, G, H};
-+ VLOAD (vsrc_1, buf_src_1, , float, f, 16, 4);
-+ VLOAD (vsrc_2, buf_src_2, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vpminnm_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4));
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_min_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VPMINNMQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc_1, float, 16, 8);
-+ DECL_VARIABLE(vsrc_2, float, 16, 8);
-+ VECT_VAR_DECL (buf_src_1, float, 16, 8) [] = {A, B, C, D, I, J, K, L};
-+ VECT_VAR_DECL (buf_src_2, float, 16, 8) [] = {E, F, G, H, M, N, O, P};
-+ VLOAD (vsrc_1, buf_src_1, q, float, f, 16, 8);
-+ VLOAD (vsrc_2, buf_src_2, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vpminnmq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8));
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_min_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VPMAXNM (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 4)
-+ = vpmaxnm_f16 (VECT_VAR (vsrc_1, float, 16, 4),
-+ VECT_VAR (vsrc_2, float, 16, 4));
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_max_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VPMAXNMQ (FP16)"
-+ clean_results ();
-+
-+ VECT_VAR (vector_res, float, 16, 8)
-+ = vpmaxnmq_f16 (VECT_VAR (vsrc_1, float, 16, 8),
-+ VECT_VAR (vsrc_2, float, 16, 8));
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_max_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vpminmaxnm_f16 ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqabs.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqabs.c
-@@ -90,9 +90,9 @@ void vqabs_extra()
- TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat_min_neg, MSG);
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 16, 4, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 32, 2, PRIx8, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_min_neg, MSG);
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 16, 8, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 32, 4, PRIx8, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_min_neg, MSG);
- }
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqdmull.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqdmull.c
-@@ -63,8 +63,8 @@ void FNNAME (INSN_NAME) (void)
- TEST_VQDMULL(int, s, 16, 32, 4, expected_cumulative_sat, "");
- TEST_VQDMULL(int, s, 32, 64, 2, expected_cumulative_sat, "");
-
-- CHECK (TEST_MSG, int, 32, 4, PRIx16, expected, "");
-- CHECK (TEST_MSG, int, 64, 2, PRIx32, expected, "");
-+ CHECK (TEST_MSG, int, 32, 4, PRIx32, expected, "");
-+ CHECK (TEST_MSG, int, 64, 2, PRIx64, expected, "");
-
- VDUP(vector, , int, s, 16, 4, 0x8000);
- VDUP(vector2, , int, s, 16, 4, 0x8000);
-@@ -75,8 +75,8 @@ void FNNAME (INSN_NAME) (void)
- TEST_VQDMULL(int, s, 16, 32, 4, expected_cumulative_sat2, TEST_MSG2);
- TEST_VQDMULL(int, s, 32, 64, 2, expected_cumulative_sat2, TEST_MSG2);
-
-- CHECK (TEST_MSG, int, 32, 4, PRIx16, expected2, TEST_MSG2);
-- CHECK (TEST_MSG, int, 64, 2, PRIx32, expected2, TEST_MSG2);
-+ CHECK (TEST_MSG, int, 32, 4, PRIx32, expected2, TEST_MSG2);
-+ CHECK (TEST_MSG, int, 64, 2, PRIx64, expected2, TEST_MSG2);
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqneg.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqneg.c
-@@ -90,9 +90,9 @@ void vqneg_extra()
- TEST_UNARY_SAT_OP(INSN_NAME, q, int, s, 32, 4, expected_cumulative_sat_min_neg, MSG);
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 16, 4, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 32, 2, PRIx8, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 32, 2, PRIx32, expected_min_neg, MSG);
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 16, 8, PRIx8, expected_min_neg, MSG);
-- CHECK(TEST_MSG, int, 32, 4, PRIx8, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_min_neg, MSG);
-+ CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_min_neg, MSG);
- }
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqtbX.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqtbX.c
-@@ -318,13 +318,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl1, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl1, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl1, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl1, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBL1Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl1q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl1q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl1q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl1q, "");
-
- /* Check vqtbl2. */
- clean_results ();
-@@ -334,13 +334,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl2, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl2, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl2, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl2, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBL2Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl2q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl2q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl2q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl2q, "");
-
- /* Check vqtbl3. */
- clean_results ();
-@@ -350,13 +350,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl3, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl3, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl3, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl3, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBL3Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl3q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl3q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl3q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl3q, "");
-
- /* Check vqtbl4. */
- clean_results ();
-@@ -366,13 +366,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbl4, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbl4, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl4, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbl4, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBL4Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbl4q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbl4q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl4q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbl4q, "");
-
-
- /* Now test VQTBX. */
-@@ -455,13 +455,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx1, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx1, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx1, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx1, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBX1Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx1q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx1q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx1q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx1q, "");
-
- /* Check vqtbx2. */
- clean_results ();
-@@ -471,13 +471,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx2, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx2, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx2, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx2, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBX2Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx2q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx2q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx2q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx2q, "");
-
- /* Check vqtbx3. */
- clean_results ();
-@@ -487,13 +487,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx3, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx3, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx3, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx3, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBX3Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx3q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx3q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx3q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx3q, "");
-
- /* Check vqtbx4. */
- clean_results ();
-@@ -503,13 +503,13 @@ void exec_vqtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vqtbx4, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vqtbx4, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx4, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vqtbx4, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VQTBX4Q"
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vqtbx4q, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vqtbx4q, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx4q, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vqtbx4q, "");
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpe.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpe.c
-@@ -7,6 +7,14 @@
- VECT_VAR_DECL(expected_positive,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected_positive,uint,32,4) [] = { 0xbf000000, 0xbf000000,
- 0xbf000000, 0xbf000000 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_positive, hfloat, 16, 4) [] = { 0x3834, 0x3834,
-+ 0x3834, 0x3834 };
-+VECT_VAR_DECL(expected_positive, hfloat, 16, 8) [] = { 0x2018, 0x2018,
-+ 0x2018, 0x2018,
-+ 0x2018, 0x2018,
-+ 0x2018, 0x2018 };
-+#endif
- VECT_VAR_DECL(expected_positive,hfloat,32,2) [] = { 0x3f068000, 0x3f068000 };
- VECT_VAR_DECL(expected_positive,hfloat,32,4) [] = { 0x3c030000, 0x3c030000,
- 0x3c030000, 0x3c030000 };
-@@ -15,24 +23,56 @@ VECT_VAR_DECL(expected_positive,hfloat,32,4) [] = { 0x3c030000, 0x3c030000,
- VECT_VAR_DECL(expected_negative,uint,32,2) [] = { 0x80000000, 0x80000000 };
- VECT_VAR_DECL(expected_negative,uint,32,4) [] = { 0xee800000, 0xee800000,
- 0xee800000, 0xee800000 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_negative, hfloat, 16, 4) [] = { 0xae64, 0xae64,
-+ 0xae64, 0xae64 };
-+VECT_VAR_DECL(expected_negative, hfloat, 16, 8) [] = { 0xa018, 0xa018,
-+ 0xa018, 0xa018,
-+ 0xa018, 0xa018,
-+ 0xa018, 0xa018 };
-+#endif
- VECT_VAR_DECL(expected_negative,hfloat,32,2) [] = { 0xbdcc8000, 0xbdcc8000 };
- VECT_VAR_DECL(expected_negative,hfloat,32,4) [] = { 0xbc030000, 0xbc030000,
- 0xbc030000, 0xbc030000 };
-
- /* Expected results with FP special values (NaN, infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+#endif
- VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results with FP special values (zero, large value). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00 };
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
-+#endif
- VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0x7f800000, 0x7f800000 };
- VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results with FP special values (-0, -infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp3, hfloat, 16, 4) [] = { 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00};
-+VECT_VAR_DECL(expected_fp3, hfloat, 16, 8) [] = { 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000,
-+ 0x8000, 0x8000 };
-+#endif
- VECT_VAR_DECL(expected_fp3,hfloat,32,2) [] = { 0xff800000, 0xff800000 };
- VECT_VAR_DECL(expected_fp3,hfloat,32,4) [] = { 0x80000000, 0x80000000,
- 0x80000000, 0x80000000 };
-
- /* Expected results with FP special large negative value. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp4, hfloat, 16, 4) [] = { 0x8000, 0x8000,
-+ 0x8000, 0x8000 };
-+#endif
- VECT_VAR_DECL(expected_fp4,hfloat,32,2) [] = { 0x80000000, 0x80000000 };
-
- #define TEST_MSG "VRECPE/VRECPEQ"
-@@ -50,11 +90,19 @@ void exec_vrecpe(void)
- /* No need for 64 bits variants. */
- DECL_VARIABLE(vector, uint, 32, 2);
- DECL_VARIABLE(vector, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, float, 32, 4);
-
- DECL_VARIABLE(vector_res, uint, 32, 2);
- DECL_VARIABLE(vector_res, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
- DECL_VARIABLE(vector_res, float, 32, 4);
-
-@@ -62,88 +110,165 @@ void exec_vrecpe(void)
-
- /* Choose init value arbitrarily, positive. */
- VDUP(vector, , uint, u, 32, 2, 0x12345678);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 1.9f);
-+#endif
- VDUP(vector, , float, f, 32, 2, 1.9f);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, q, float, f, 16, 8, 125.0f);
-+#endif
- VDUP(vector, q, uint, u, 32, 4, 0xABCDEF10);
- VDUP(vector, q, float, f, 32, 4, 125.0f);
-
- /* Apply the operator. */
- TEST_VRECPE(, uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(, float, f, 16, 4);
-+#endif
- TEST_VRECPE(, float, f, 32, 2);
- TEST_VRECPE(q, uint, u, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(q, float, f, 16, 8);
-+#endif
- TEST_VRECPE(q, float, f, 32, 4);
-
- #define CMT " (positive input)"
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_positive, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_positive, CMT);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_positive, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_positive, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_positive, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_positive, CMT);
-
-   /* Choose init value arbitrarily, negative. */
- VDUP(vector, , uint, u, 32, 2, 0xFFFFFFFF);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, -10.0f);
-+#endif
- VDUP(vector, , float, f, 32, 2, -10.0f);
- VDUP(vector, q, uint, u, 32, 4, 0x89081234);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, q, float, f, 16, 8, -125.0f);
-+#endif
- VDUP(vector, q, float, f, 32, 4, -125.0f);
-
- /* Apply the operator. */
- TEST_VRECPE(, uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(, float, f, 16, 4);
-+#endif
- TEST_VRECPE(, float, f, 32, 2);
- TEST_VRECPE(q, uint, u, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(q, float, f, 16, 8);
-+#endif
- TEST_VRECPE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " (negative input)"
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_negative, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_negative, CMT);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_negative, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_negative, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_negative, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_negative, CMT);
-
- /* Test FP variants with special input values (NaN, infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, NAN);
-+ VDUP(vector, q, float, f, 16, 8, HUGE_VALF);
-+#endif
- VDUP(vector, , float, f, 32, 2, NAN);
- VDUP(vector, q, float, f, 32, 4, HUGE_VALF);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(, float, f, 16, 4);
-+ TEST_VRECPE(q, float, f, 16, 8);
-+#endif
- TEST_VRECPE(, float, f, 32, 2);
- TEST_VRECPE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (NaN, infinity)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT);
-
- /* Test FP variants with special input values (zero, large value). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 0.0f);
-+ VDUP(vector, q, float, f, 16, 8, 8.97229e37f /*9.0e37f*/);
-+#endif
- VDUP(vector, , float, f, 32, 2, 0.0f);
- VDUP(vector, q, float, f, 32, 4, 8.97229e37f /*9.0e37f*/);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(, float, f, 16, 4);
-+ TEST_VRECPE(q, float, f, 16, 8);
-+#endif
- TEST_VRECPE(, float, f, 32, 2);
- TEST_VRECPE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (zero, large value)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT);
-
- /* Test FP variants with special input values (-0, -infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, -0.0f);
-+ VDUP(vector, q, float, f, 16, 8, -HUGE_VALF);
-+#endif
- VDUP(vector, , float, f, 32, 2, -0.0f);
- VDUP(vector, q, float, f, 32, 4, -HUGE_VALF);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(, float, f, 16, 4);
-+ TEST_VRECPE(q, float, f, 16, 8);
-+#endif
- TEST_VRECPE(, float, f, 32, 2);
- TEST_VRECPE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (-0, -infinity)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp3, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp3, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp3, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp3, CMT);
-
- /* Test FP variants with special input values (large negative value). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, -9.0e37f);
-+#endif
- VDUP(vector, , float, f, 32, 2, -9.0e37f);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPE(, float, f, 16, 4);
-+#endif
- TEST_VRECPE(, float, f, 32, 2);
-
- #undef CMT
- #define CMT " FP special (large negative value)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp4, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp4, CMT);
- }
-
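
Aside: the expected_fp* tables above are just the IEEE bit encodings of FRECPE's
special-case results (1/NaN is the default NaN, 1/+-0 is +-infinity, and 1/x for
huge |x| flushes to +-0 once the true reciprocal would be subnormal). A minimal
sketch, assuming only standard C, that reinterprets the 32-bit patterns from the
arrays above to show which special case each one encodes:

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      /* 32-bit patterns taken from the expected_fp* arrays above, with
         the FRECPE special case each one encodes.  */
      struct { unsigned bits; const char *why; } cases[] = {
        { 0x7fc00000, "default NaN: 1/NaN" },
        { 0x7f800000, "+inf: 1/+0" },
        { 0xff800000, "-inf: 1/-0" },
        { 0x00000000, "+0: 1/+huge (true reciprocal is subnormal)" },
        { 0x80000000, "-0: 1/-inf and 1/-huge" },
      };
      for (unsigned i = 0; i < sizeof cases / sizeof cases[0]; i++)
        {
          float f;
          memcpy (&f, &cases[i].bits, sizeof f);  /* bit-level reinterpret */
          printf ("0x%08x -> %f (%s)\n", cases[i].bits, (double) f,
                  cases[i].why);
        }
      return 0;
    }
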
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpeh_f16_1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+#define A 123.4
-+#define B 567.8
-+#define C 34.8
-+#define D 1024
-+#define E 663.1
-+#define F 144.0
-+#define G 4.8
-+#define H 77
-+
-+#define RECP_A 0x2028 /* 1/A. */
-+#define RECP_B 0x1734 /* 1/B. */
-+#define RECP_C 0x275C /* 1/C. */
-+#define RECP_D 0x13FC /* 1/D. */
-+#define RECP_E 0x162C /* 1/E. */
-+#define RECP_F 0x1F18 /* 1/F. */
-+#define RECP_G 0x32A8 /* 1/G. */
-+#define RECP_H 0x22A4 /* 1/H. */
-+
-+float16_t input[] = { A, B, C, D, E, F, G, H };
-+uint16_t expected[] = { RECP_A, RECP_B, RECP_C, RECP_D,
-+ RECP_E, RECP_F, RECP_G, RECP_H };
-+
-+#define TEST_MSG "VRECPEH_F16"
-+#define INSN_NAME vrecpeh_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecps.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecps.c
-@@ -4,22 +4,51 @@
- #include <math.h>
-
- /* Expected results with positive input. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xd70c, 0xd70c, 0xd70c, 0xd70c };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xcedc, 0xcedc, 0xcedc, 0xcedc,
-+ 0xcedc, 0xcedc, 0xcedc, 0xcedc };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc2e19eb7, 0xc2e19eb7 };
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1db851f, 0xc1db851f,
- 0xc1db851f, 0xc1db851f };
-
- /* Expected results with FP special values (NaN). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+#endif
- VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
- 0x7fc00000, 0x7fc00000 };
-
- /* Expected results with FP special values (infinity, 0) and normal
- values. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00 };
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x4000, 0x4000,
-+ 0x4000, 0x4000,
-+ 0x4000, 0x4000,
-+ 0x4000, 0x4000 };
-+#endif
- VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0xff800000, 0xff800000 };
- VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x40000000, 0x40000000,
- 0x40000000, 0x40000000 };
-
- /* Expected results with FP special values (infinity, 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp3, hfloat, 16, 4) [] = { 0x4000, 0x4000,
-+ 0x4000, 0x4000 };
-+VECT_VAR_DECL(expected_fp3, hfloat, 16, 8) [] = { 0x4000, 0x4000,
-+ 0x4000, 0x4000,
-+ 0x4000, 0x4000,
-+ 0x4000, 0x4000 };
-+#endif
- VECT_VAR_DECL(expected_fp3,hfloat,32,2) [] = { 0x40000000, 0x40000000 };
- VECT_VAR_DECL(expected_fp3,hfloat,32,4) [] = { 0x40000000, 0x40000000,
- 0x40000000, 0x40000000 };
-@@ -38,74 +67,143 @@ void exec_vrecps(void)
- VECT_VAR(vector_res, T1, W, N))
-
- /* No need for integer variants. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector2, float, 32, 2);
- DECL_VARIABLE(vector2, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
- DECL_VARIABLE(vector_res, float, 32, 4);
-
- clean_results ();
-
- /* Choose init value arbitrarily. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 12.9f);
-+ VDUP(vector, q, float, f, 16, 8, 9.2f);
-+#endif
- VDUP(vector, , float, f, 32, 2, 12.9f);
- VDUP(vector, q, float, f, 32, 4, 9.2f);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 8.9f);
-+ VDUP(vector2, q, float, f, 16, 8, 3.2f);
-+#endif
- VDUP(vector2, , float, f, 32, 2, 8.9f);
- VDUP(vector2, q, float, f, 32, 4, 3.2f);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPS(, float, f, 16, 4);
-+ TEST_VRECPS(q, float, f, 16, 8);
-+#endif
- TEST_VRECPS(, float, f, 32, 2);
- TEST_VRECPS(q, float, f, 32, 4);
-
- #define CMT " (positive input)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, CMT);
-
-
- /* Test FP variants with special input values (NaN). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, NAN);
-+ VDUP(vector2, q, float, f, 16, 8, NAN);
-+#endif
- VDUP(vector, , float, f, 32, 2, NAN);
- VDUP(vector2, q, float, f, 32, 4, NAN);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPS(, float, f, 16, 4);
-+ TEST_VRECPS(q, float, f, 16, 8);
-+#endif
- TEST_VRECPS(, float, f, 32, 2);
- TEST_VRECPS(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (NaN)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT);
-
-
- /* Test FP variants with special input values (infinity, 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, HUGE_VALF);
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, 3.2f); /* Restore a normal value. */
-+#endif
- VDUP(vector, , float, f, 32, 2, HUGE_VALF);
- VDUP(vector, q, float, f, 32, 4, 0.0f);
- VDUP(vector2, q, float, f, 32, 4, 3.2f); /* Restore a normal value. */
-
-+
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPS(, float, f, 16, 4);
-+ TEST_VRECPS(q, float, f, 16, 8);
-+#endif
- TEST_VRECPS(, float, f, 32, 2);
- TEST_VRECPS(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (infinity, 0) and normal value"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT);
-
-
- /* Test FP variants with only special input values (infinity, 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, HUGE_VALF);
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+ VDUP(vector2, , float, f, 16, 4, 0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, HUGE_VALF);
-+#endif
- VDUP(vector, , float, f, 32, 2, HUGE_VALF);
- VDUP(vector, q, float, f, 32, 4, 0.0f);
- VDUP(vector2, , float, f, 32, 2, 0.0f);
- VDUP(vector2, q, float, f, 32, 4, HUGE_VALF);
-
-+
- /* Apply the operator */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRECPS(, float, f, 16, 4);
-+ TEST_VRECPS(q, float, f, 16, 8);
-+#endif
- TEST_VRECPS(, float, f, 32, 2);
- TEST_VRECPS(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (infinity, 0)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp3, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp3, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp3, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp3, CMT);
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpsh_f16_1.c
-@@ -0,0 +1,50 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+#define A 12.4
-+#define B -5.8
-+#define C -3.8
-+#define D 10
-+#define E 66.1
-+#define F 16.1
-+#define G -4.8
-+#define H -77
-+
-+#define I 0.7
-+#define J -78
-+#define K 10.23
-+#define L 98
-+#define M 87
-+#define N -87.81
-+#define O -1.1
-+#define P 47.8
-+
-+float16_t input_1[] = { A, B, C, D, I, J, K, L };
-+float16_t input_2[] = { E, F, G, H, M, N, O, P };
-+uint16_t expected[] = { 0xE264 /* 2.0f - A * E. */,
-+ 0x55F6 /* 2.0f - B * F. */,
-+ 0xCC10 /* 2.0f - C * G. */,
-+ 0x6208 /* 2.0f - D * H. */,
-+ 0xD35D /* 2.0f - I * M. */,
-+ 0xEEB0 /* 2.0f - J * N. */,
-+ 0x4A9F /* 2.0f - K * O. */,
-+ 0xEC93 /* 2.0f - L * P. */ };
-+
-+#define TEST_MSG "VRECPSH_F16"
-+#define INSN_NAME vrecpsh_f16
-+
-+#define INPUT_1 input_1
-+#define INPUT_2 input_2
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrecpxh_f16_1.c
-@@ -0,0 +1,32 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+
-+float16_t input[] = { 123.4, 567.8, 34.8, 1024, 663.1, 144.0, 4.8, 77 };
-+/* Expected results are calculated by:
-+ for (index = 0; index < 8; index++)
-+ {
-+      uint16_t src_cast = * (uint16_t *) &input[index];
-+ * (uint16_t *) &expected[index] =
-+ (src_cast & 0x8000) | (~src_cast & 0x7C00);
-+ } */
-+uint16_t expected[8] = { 0x2800, 0x1C00, 0x2C00, 0x1800,
-+ 0x1C00, 0x2400, 0x3800, 0x2800 };
-+
-+#define TEST_MSG "VRECPXH_F16"
-+#define INSN_NAME vrecpxh_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
-@@ -21,6 +21,8 @@ VECT_VAR_DECL(expected_s8_8,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf4, 0xf5, 0xf6, 0xf7 };
- VECT_VAR_DECL(expected_s8_9,int,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
- 0xf2, 0xff, 0xf3, 0xff };
-+VECT_VAR_DECL(expected_s8_10,int,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
-+ 0x00, 0xcb, 0x80, 0xca };
-
- /* Expected results for vreinterpret_s16_xx. */
- VECT_VAR_DECL(expected_s16_1,int,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-@@ -32,6 +34,7 @@ VECT_VAR_DECL(expected_s16_6,int,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
- VECT_VAR_DECL(expected_s16_7,int,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
- VECT_VAR_DECL(expected_s16_8,int,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
- VECT_VAR_DECL(expected_s16_9,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
-+VECT_VAR_DECL(expected_s16_10,int,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
-
- /* Expected results for vreinterpret_s32_xx. */
- VECT_VAR_DECL(expected_s32_1,int,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
-@@ -43,6 +46,7 @@ VECT_VAR_DECL(expected_s32_6,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
- VECT_VAR_DECL(expected_s32_7,int,32,2) [] = { 0xfffffff0, 0xffffffff };
- VECT_VAR_DECL(expected_s32_8,int,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
- VECT_VAR_DECL(expected_s32_9,int,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
-+VECT_VAR_DECL(expected_s32_10,int,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
-
- /* Expected results for vreinterpret_s64_xx. */
- VECT_VAR_DECL(expected_s64_1,int,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
-@@ -54,6 +58,7 @@ VECT_VAR_DECL(expected_s64_6,int,64,1) [] = { 0xfffffff1fffffff0 };
- VECT_VAR_DECL(expected_s64_7,int,64,1) [] = { 0xfffffffffffffff0 };
- VECT_VAR_DECL(expected_s64_8,int,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
- VECT_VAR_DECL(expected_s64_9,int,64,1) [] = { 0xfff3fff2fff1fff0 };
-+VECT_VAR_DECL(expected_s64_10,int,64,1) [] = { 0xca80cb00cb80cc00 };
-
- /* Expected results for vreinterpret_u8_xx. */
- VECT_VAR_DECL(expected_u8_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
-@@ -74,6 +79,8 @@ VECT_VAR_DECL(expected_u8_8,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf4, 0xf5, 0xf6, 0xf7 };
- VECT_VAR_DECL(expected_u8_9,uint,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
- 0xf2, 0xff, 0xf3, 0xff };
-+VECT_VAR_DECL(expected_u8_10,uint,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
-+ 0x00, 0xcb, 0x80, 0xca };
-
- /* Expected results for vreinterpret_u16_xx. */
- VECT_VAR_DECL(expected_u16_1,uint,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-@@ -85,6 +92,7 @@ VECT_VAR_DECL(expected_u16_6,uint,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
- VECT_VAR_DECL(expected_u16_7,uint,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
- VECT_VAR_DECL(expected_u16_8,uint,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
- VECT_VAR_DECL(expected_u16_9,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
-+VECT_VAR_DECL(expected_u16_10,uint,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
-
- /* Expected results for vreinterpret_u32_xx. */
- VECT_VAR_DECL(expected_u32_1,uint,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
-@@ -96,6 +104,7 @@ VECT_VAR_DECL(expected_u32_6,uint,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
- VECT_VAR_DECL(expected_u32_7,uint,32,2) [] = { 0xfffffff0, 0xffffffff };
- VECT_VAR_DECL(expected_u32_8,uint,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
- VECT_VAR_DECL(expected_u32_9,uint,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
-+VECT_VAR_DECL(expected_u32_10,uint,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
-
- /* Expected results for vreinterpret_u64_xx. */
- VECT_VAR_DECL(expected_u64_1,uint,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
-@@ -107,6 +116,7 @@ VECT_VAR_DECL(expected_u64_6,uint,64,1) [] = { 0xfff3fff2fff1fff0 };
- VECT_VAR_DECL(expected_u64_7,uint,64,1) [] = { 0xfffffff1fffffff0 };
- VECT_VAR_DECL(expected_u64_8,uint,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
- VECT_VAR_DECL(expected_u64_9,uint,64,1) [] = { 0xfff3fff2fff1fff0 };
-+VECT_VAR_DECL(expected_u64_10,uint,64,1) [] = { 0xca80cb00cb80cc00 };
-
- /* Expected results for vreinterpret_p8_xx. */
- VECT_VAR_DECL(expected_p8_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
-@@ -127,6 +137,8 @@ VECT_VAR_DECL(expected_p8_8,poly,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff };
- VECT_VAR_DECL(expected_p8_9,poly,8,8) [] = { 0xf0, 0xff, 0xf1, 0xff,
- 0xf2, 0xff, 0xf3, 0xff };
-+VECT_VAR_DECL(expected_p8_10,poly,8,8) [] = { 0x00, 0xcc, 0x80, 0xcb,
-+ 0x00, 0xcb, 0x80, 0xca };
-
- /* Expected results for vreinterpret_p16_xx. */
- VECT_VAR_DECL(expected_p16_1,poly,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-@@ -138,6 +150,7 @@ VECT_VAR_DECL(expected_p16_6,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
- VECT_VAR_DECL(expected_p16_7,poly,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
- VECT_VAR_DECL(expected_p16_8,poly,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
- VECT_VAR_DECL(expected_p16_9,poly,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-+VECT_VAR_DECL(expected_p16_10,poly,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
-
- /* Expected results for vreinterpretq_s8_xx. */
- VECT_VAR_DECL(expected_q_s8_1,int,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
-@@ -176,6 +189,10 @@ VECT_VAR_DECL(expected_q_s8_9,int,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
- 0xf2, 0xff, 0xf3, 0xff,
- 0xf4, 0xff, 0xf5, 0xff,
- 0xf6, 0xff, 0xf7, 0xff };
-+VECT_VAR_DECL(expected_q_s8_10,int,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
-+ 0x00, 0xcb, 0x80, 0xca,
-+ 0x00, 0xca, 0x80, 0xc9,
-+ 0x00, 0xc9, 0x80, 0xc8 };
-
- /* Expected results for vreinterpretq_s16_xx. */
- VECT_VAR_DECL(expected_q_s16_1,int,16,8) [] = { 0xf1f0, 0xf3f2,
-@@ -214,6 +231,10 @@ VECT_VAR_DECL(expected_q_s16_9,int,16,8) [] = { 0xfff0, 0xfff1,
- 0xfff2, 0xfff3,
- 0xfff4, 0xfff5,
- 0xfff6, 0xfff7 };
-+VECT_VAR_DECL(expected_q_s16_10,int,16,8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-
- /* Expected results for vreinterpretq_s32_xx. */
- VECT_VAR_DECL(expected_q_s32_1,int,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
-@@ -234,6 +255,8 @@ VECT_VAR_DECL(expected_q_s32_8,int,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
- 0xfbfaf9f8, 0xfffefdfc };
- VECT_VAR_DECL(expected_q_s32_9,int,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
- 0xfff5fff4, 0xfff7fff6 };
-+VECT_VAR_DECL(expected_q_s32_10,int,32,4) [] = { 0xcb80cc00, 0xca80cb00,
-+ 0xc980ca00, 0xc880c900 };
-
- /* Expected results for vreinterpretq_s64_xx. */
- VECT_VAR_DECL(expected_q_s64_1,int,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-@@ -254,6 +277,8 @@ VECT_VAR_DECL(expected_q_s64_8,int,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
- 0xfffefdfcfbfaf9f8 };
- VECT_VAR_DECL(expected_q_s64_9,int,64,2) [] = { 0xfff3fff2fff1fff0,
- 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(expected_q_s64_10,int,64,2) [] = { 0xca80cb00cb80cc00,
-+ 0xc880c900c980ca00 };
-
- /* Expected results for vreinterpretq_u8_xx. */
- VECT_VAR_DECL(expected_q_u8_1,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
-@@ -292,6 +317,10 @@ VECT_VAR_DECL(expected_q_u8_9,uint,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
- 0xf2, 0xff, 0xf3, 0xff,
- 0xf4, 0xff, 0xf5, 0xff,
- 0xf6, 0xff, 0xf7, 0xff };
-+VECT_VAR_DECL(expected_q_u8_10,uint,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
-+ 0x00, 0xcb, 0x80, 0xca,
-+ 0x00, 0xca, 0x80, 0xc9,
-+ 0x00, 0xc9, 0x80, 0xc8 };
-
- /* Expected results for vreinterpretq_u16_xx. */
- VECT_VAR_DECL(expected_q_u16_1,uint,16,8) [] = { 0xf1f0, 0xf3f2,
-@@ -330,6 +359,10 @@ VECT_VAR_DECL(expected_q_u16_9,uint,16,8) [] = { 0xfff0, 0xfff1,
- 0xfff2, 0xfff3,
- 0xfff4, 0xfff5,
- 0xfff6, 0xfff7 };
-+VECT_VAR_DECL(expected_q_u16_10,uint,16,8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-
- /* Expected results for vreinterpretq_u32_xx. */
- VECT_VAR_DECL(expected_q_u32_1,uint,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
-@@ -350,6 +383,8 @@ VECT_VAR_DECL(expected_q_u32_8,uint,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
- 0xfbfaf9f8, 0xfffefdfc };
- VECT_VAR_DECL(expected_q_u32_9,uint,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
- 0xfff5fff4, 0xfff7fff6 };
-+VECT_VAR_DECL(expected_q_u32_10,uint,32,4) [] = { 0xcb80cc00, 0xca80cb00,
-+ 0xc980ca00, 0xc880c900 };
-
- /* Expected results for vreinterpretq_u64_xx. */
- VECT_VAR_DECL(expected_q_u64_1,uint,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-@@ -370,6 +405,92 @@ VECT_VAR_DECL(expected_q_u64_8,uint,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
- 0xfffefdfcfbfaf9f8 };
- VECT_VAR_DECL(expected_q_u64_9,uint,64,2) [] = { 0xfff3fff2fff1fff0,
- 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(expected_q_u64_10,uint,64,2) [] = { 0xca80cb00cb80cc00,
-+ 0xc880c900c980ca00 };
-+
-+/* Expected results for vreinterpretq_p8_xx. */
-+VECT_VAR_DECL(expected_q_p8_1,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
-+ 0xf4, 0xf5, 0xf6, 0xf7,
-+ 0xf8, 0xf9, 0xfa, 0xfb,
-+ 0xfc, 0xfd, 0xfe, 0xff };
-+VECT_VAR_DECL(expected_q_p8_2,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
-+ 0xf2, 0xff, 0xf3, 0xff,
-+ 0xf4, 0xff, 0xf5, 0xff,
-+ 0xf6, 0xff, 0xf7, 0xff };
-+VECT_VAR_DECL(expected_q_p8_3,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xf2, 0xff, 0xff, 0xff,
-+ 0xf3, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected_q_p8_4,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected_q_p8_5,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
-+ 0xf4, 0xf5, 0xf6, 0xf7,
-+ 0xf8, 0xf9, 0xfa, 0xfb,
-+ 0xfc, 0xfd, 0xfe, 0xff };
-+VECT_VAR_DECL(expected_q_p8_6,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
-+ 0xf2, 0xff, 0xf3, 0xff,
-+ 0xf4, 0xff, 0xf5, 0xff,
-+ 0xf6, 0xff, 0xf7, 0xff };
-+VECT_VAR_DECL(expected_q_p8_7,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xf2, 0xff, 0xff, 0xff,
-+ 0xf3, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected_q_p8_8,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected_q_p8_9,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
-+ 0xf2, 0xff, 0xf3, 0xff,
-+ 0xf4, 0xff, 0xf5, 0xff,
-+ 0xf6, 0xff, 0xf7, 0xff };
-+VECT_VAR_DECL(expected_q_p8_10,poly,8,16) [] = { 0x00, 0xcc, 0x80, 0xcb,
-+ 0x00, 0xcb, 0x80, 0xca,
-+ 0x00, 0xca, 0x80, 0xc9,
-+ 0x00, 0xc9, 0x80, 0xc8 };
-+
-+/* Expected results for vreinterpretq_p16_xx. */
-+VECT_VAR_DECL(expected_q_p16_1,poly,16,8) [] = { 0xf1f0, 0xf3f2,
-+ 0xf5f4, 0xf7f6,
-+ 0xf9f8, 0xfbfa,
-+ 0xfdfc, 0xfffe };
-+VECT_VAR_DECL(expected_q_p16_2,poly,16,8) [] = { 0xfff0, 0xfff1,
-+ 0xfff2, 0xfff3,
-+ 0xfff4, 0xfff5,
-+ 0xfff6, 0xfff7 };
-+VECT_VAR_DECL(expected_q_p16_3,poly,16,8) [] = { 0xfff0, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xfff2, 0xffff,
-+ 0xfff3, 0xffff };
-+VECT_VAR_DECL(expected_q_p16_4,poly,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(expected_q_p16_5,poly,16,8) [] = { 0xf1f0, 0xf3f2,
-+ 0xf5f4, 0xf7f6,
-+ 0xf9f8, 0xfbfa,
-+ 0xfdfc, 0xfffe };
-+VECT_VAR_DECL(expected_q_p16_6,poly,16,8) [] = { 0xfff0, 0xfff1,
-+ 0xfff2, 0xfff3,
-+ 0xfff4, 0xfff5,
-+ 0xfff6, 0xfff7 };
-+VECT_VAR_DECL(expected_q_p16_7,poly,16,8) [] = { 0xfff0, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xfff2, 0xffff,
-+ 0xfff3, 0xffff };
-+VECT_VAR_DECL(expected_q_p16_8,poly,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(expected_q_p16_9,poly,16,8) [] = { 0xf1f0, 0xf3f2,
-+ 0xf5f4, 0xf7f6,
-+ 0xf9f8, 0xfbfa,
-+ 0xfdfc, 0xfffe };
-+VECT_VAR_DECL(expected_q_p16_10,poly,16,8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-
- /* Expected results for vreinterpret_f32_xx. */
- VECT_VAR_DECL(expected_f32_1,hfloat,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
-@@ -382,6 +503,7 @@ VECT_VAR_DECL(expected_f32_7,hfloat,32,2) [] = { 0xfffffff0, 0xfffffff1 };
- VECT_VAR_DECL(expected_f32_8,hfloat,32,2) [] = { 0xfffffff0, 0xffffffff };
- VECT_VAR_DECL(expected_f32_9,hfloat,32,2) [] = { 0xf3f2f1f0, 0xf7f6f5f4 };
- VECT_VAR_DECL(expected_f32_10,hfloat,32,2) [] = { 0xfff1fff0, 0xfff3fff2 };
-+VECT_VAR_DECL(expected_f32_11,hfloat,32,2) [] = { 0xcb80cc00, 0xca80cb00 };
-
- /* Expected results for vreinterpretq_f32_xx. */
- VECT_VAR_DECL(expected_q_f32_1,hfloat,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
-@@ -404,8 +526,10 @@ VECT_VAR_DECL(expected_q_f32_9,hfloat,32,4) [] = { 0xf3f2f1f0, 0xf7f6f5f4,
- 0xfbfaf9f8, 0xfffefdfc };
- VECT_VAR_DECL(expected_q_f32_10,hfloat,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
- 0xfff5fff4, 0xfff7fff6 };
-+VECT_VAR_DECL(expected_q_f32_11,hfloat,32,4) [] = { 0xcb80cc00, 0xca80cb00,
-+ 0xc980ca00, 0xc880c900 };
-
--/* Expected results for vreinterpretq_xx_f32. */
-+/* Expected results for vreinterpret_xx_f32. */
- VECT_VAR_DECL(expected_xx_f32_1,int,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
- 0x0, 0x0, 0x70, 0xc1 };
- VECT_VAR_DECL(expected_xx_f32_2,int,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
-@@ -419,6 +543,7 @@ VECT_VAR_DECL(expected_xx_f32_8,uint,64,1) [] = { 0xc1700000c1800000 };
- VECT_VAR_DECL(expected_xx_f32_9,poly,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
- 0x0, 0x0, 0x70, 0xc1 };
- VECT_VAR_DECL(expected_xx_f32_10,poly,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
-+VECT_VAR_DECL(expected_xx_f32_11,hfloat,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };
-
- /* Expected results for vreinterpretq_xx_f32. */
- VECT_VAR_DECL(expected_q_xx_f32_1,int,8,16) [] = { 0x0, 0x0, 0x80, 0xc1,
-@@ -447,6 +572,62 @@ VECT_VAR_DECL(expected_q_xx_f32_9,poly,8,16) [] = { 0x0, 0x0, 0x80, 0xc1,
- 0x0, 0x0, 0x50, 0xc1 };
- VECT_VAR_DECL(expected_q_xx_f32_10,poly,16,8) [] = { 0x0, 0xc180, 0x0, 0xc170,
- 0x0, 0xc160, 0x0, 0xc150 };
-+VECT_VAR_DECL(expected_q_xx_f32_11,hfloat,16,8) [] = { 0x0, 0xc180, 0x0, 0xc170,
-+ 0x0, 0xc160, 0x0, 0xc150 };
-+
-+/* Expected results for vreinterpret_f16_xx. */
-+VECT_VAR_DECL(expected_f16_1,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-+VECT_VAR_DECL(expected_f16_2,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
-+VECT_VAR_DECL(expected_f16_3,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
-+VECT_VAR_DECL(expected_f16_4,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL(expected_f16_5,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-+VECT_VAR_DECL(expected_f16_6,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
-+VECT_VAR_DECL(expected_f16_7,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xfff1, 0xffff };
-+VECT_VAR_DECL(expected_f16_8,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL(expected_f16_9,hfloat,16,4) [] = { 0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6 };
-+VECT_VAR_DECL(expected_f16_10,hfloat,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
-+
-+/* Expected results for vreinterpretq_f16_xx. */
-+VECT_VAR_DECL(expected_q_f16_1,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
-+ 0xf5f4, 0xf7f6,
-+ 0xf9f8, 0xfbfa,
-+ 0xfdfc, 0xfffe };
-+VECT_VAR_DECL(expected_q_f16_2,hfloat,16,8) [] = { 0xfff0, 0xfff1,
-+ 0xfff2, 0xfff3,
-+ 0xfff4, 0xfff5,
-+ 0xfff6, 0xfff7 };
-+VECT_VAR_DECL(expected_q_f16_3,hfloat,16,8) [] = { 0xfff0, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xfff2, 0xffff,
-+ 0xfff3, 0xffff };
-+VECT_VAR_DECL(expected_q_f16_4,hfloat,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(expected_q_f16_5,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
-+ 0xf5f4, 0xf7f6,
-+ 0xf9f8, 0xfbfa,
-+ 0xfdfc, 0xfffe };
-+VECT_VAR_DECL(expected_q_f16_6,hfloat,16,8) [] = { 0xfff0, 0xfff1,
-+ 0xfff2, 0xfff3,
-+ 0xfff4, 0xfff5,
-+ 0xfff6, 0xfff7 };
-+VECT_VAR_DECL(expected_q_f16_7,hfloat,16,8) [] = { 0xfff0, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xfff2, 0xffff,
-+ 0xfff3, 0xffff };
-+VECT_VAR_DECL(expected_q_f16_8,hfloat,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(expected_q_f16_9,hfloat,16,8) [] = { 0xf1f0, 0xf3f2,
-+ 0xf5f4, 0xf7f6,
-+ 0xf9f8, 0xfbfa,
-+ 0xfdfc, 0xfffe };
-+VECT_VAR_DECL(expected_q_f16_10,hfloat,16,8) [] = { 0xfff0, 0xfff1,
-+ 0xfff2, 0xfff3,
-+ 0xfff4, 0xfff5,
-+ 0xfff6, 0xfff7 };
-
- #define TEST_MSG "VREINTERPRET/VREINTERPRETQ"
-
-@@ -484,6 +665,10 @@ void exec_vreinterpret (void)
-
- /* Initialize input "vector" from "buffer". */
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
-@@ -497,6 +682,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, int, s, 8, 8, uint, u, 64, 1, expected_s8_7);
- TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 8, 8, expected_s8_8);
- TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 16, 4, expected_s8_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, int, s, 8, 8, float, f, 16, 4, expected_s8_10);
-+#endif
-
- /* vreinterpret_s16_xx. */
- TEST_VREINTERPRET(, int, s, 16, 4, int, s, 8, 8, expected_s16_1);
-@@ -508,6 +696,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, int, s, 16, 4, uint, u, 64, 1, expected_s16_7);
- TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 8, 8, expected_s16_8);
- TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 16, 4, expected_s16_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, int, s, 16, 4, float, f, 16, 4, expected_s16_10);
-+#endif
-
- /* vreinterpret_s32_xx. */
- TEST_VREINTERPRET(, int, s, 32, 2, int, s, 8, 8, expected_s32_1);
-@@ -519,6 +710,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, int, s, 32, 2, uint, u, 64, 1, expected_s32_7);
- TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 8, 8, expected_s32_8);
- TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 16, 4, expected_s32_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, int, s, 32, 2, float, f, 16, 4, expected_s32_10);
-+#endif
-
- /* vreinterpret_s64_xx. */
- TEST_VREINTERPRET(, int, s, 64, 1, int, s, 8, 8, expected_s64_1);
-@@ -530,6 +724,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, int, s, 64, 1, uint, u, 64, 1, expected_s64_7);
- TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 8, 8, expected_s64_8);
- TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 16, 4, expected_s64_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, int, s, 64, 1, float, f, 16, 4, expected_s64_10);
-+#endif
-
- /* vreinterpret_u8_xx. */
- TEST_VREINTERPRET(, uint, u, 8, 8, int, s, 8, 8, expected_u8_1);
-@@ -541,6 +738,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, uint, u, 8, 8, uint, u, 64, 1, expected_u8_7);
- TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 8, 8, expected_u8_8);
- TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 16, 4, expected_u8_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, uint, u, 8, 8, float, f, 16, 4, expected_u8_10);
-+#endif
-
- /* vreinterpret_u16_xx. */
- TEST_VREINTERPRET(, uint, u, 16, 4, int, s, 8, 8, expected_u16_1);
-@@ -552,6 +752,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, uint, u, 16, 4, uint, u, 64, 1, expected_u16_7);
- TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 8, 8, expected_u16_8);
- TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 16, 4, expected_u16_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, uint, u, 16, 4, float, f, 16, 4, expected_u16_10);
-+#endif
-
- /* vreinterpret_u32_xx. */
- TEST_VREINTERPRET(, uint, u, 32, 2, int, s, 8, 8, expected_u32_1);
-@@ -563,6 +766,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, uint, u, 32, 2, uint, u, 64, 1, expected_u32_7);
- TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 8, 8, expected_u32_8);
- TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 16, 4, expected_u32_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, uint, u, 32, 2, float, f, 16, 4, expected_u32_10);
-+#endif
-
- /* vreinterpret_u64_xx. */
- TEST_VREINTERPRET(, uint, u, 64, 1, int, s, 8, 8, expected_u64_1);
-@@ -574,6 +780,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, uint, u, 64, 1, uint, u, 32, 2, expected_u64_7);
- TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 8, 8, expected_u64_8);
- TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 16, 4, expected_u64_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(, uint, u, 64, 1, float, f, 16, 4, expected_u64_10);
-+#endif
-
- /* vreinterpret_p8_xx. */
- TEST_VREINTERPRET_POLY(, poly, p, 8, 8, int, s, 8, 8, expected_p8_1);
-@@ -585,6 +794,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET_POLY(, poly, p, 8, 8, uint, u, 32, 2, expected_p8_7);
- TEST_VREINTERPRET_POLY(, poly, p, 8, 8, uint, u, 64, 1, expected_p8_8);
- TEST_VREINTERPRET_POLY(, poly, p, 8, 8, poly, p, 16, 4, expected_p8_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_POLY(, poly, p, 8, 8, float, f, 16, 4, expected_p8_10);
-+#endif
-
- /* vreinterpret_p16_xx. */
- TEST_VREINTERPRET_POLY(, poly, p, 16, 4, int, s, 8, 8, expected_p16_1);
-@@ -596,6 +808,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET_POLY(, poly, p, 16, 4, uint, u, 32, 2, expected_p16_7);
- TEST_VREINTERPRET_POLY(, poly, p, 16, 4, uint, u, 64, 1, expected_p16_8);
- TEST_VREINTERPRET_POLY(, poly, p, 16, 4, poly, p, 8, 8, expected_p16_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_POLY(, poly, p, 16, 4, float, f, 16, 4, expected_p16_10);
-+#endif
-
- /* vreinterpretq_s8_xx. */
- TEST_VREINTERPRET(q, int, s, 8, 16, int, s, 16, 8, expected_q_s8_1);
-@@ -607,6 +822,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, int, s, 8, 16, uint, u, 64, 2, expected_q_s8_7);
- TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 8, 16, expected_q_s8_8);
- TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 16, 8, expected_q_s8_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, int, s, 8, 16, float, f, 16, 8, expected_q_s8_10);
-+#endif
-
- /* vreinterpretq_s16_xx. */
- TEST_VREINTERPRET(q, int, s, 16, 8, int, s, 8, 16, expected_q_s16_1);
-@@ -618,6 +836,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, int, s, 16, 8, uint, u, 64, 2, expected_q_s16_7);
- TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 8, 16, expected_q_s16_8);
- TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 16, 8, expected_q_s16_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, int, s, 16, 8, float, f, 16, 8, expected_q_s16_10);
-+#endif
-
- /* vreinterpretq_s32_xx. */
- TEST_VREINTERPRET(q, int, s, 32, 4, int, s, 8, 16, expected_q_s32_1);
-@@ -629,6 +850,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, int, s, 32, 4, uint, u, 64, 2, expected_q_s32_7);
- TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 8, 16, expected_q_s32_8);
- TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 16, 8, expected_q_s32_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, int, s, 32, 4, float, f, 16, 8, expected_q_s32_10);
-+#endif
-
- /* vreinterpretq_s64_xx. */
- TEST_VREINTERPRET(q, int, s, 64, 2, int, s, 8, 16, expected_q_s64_1);
-@@ -640,6 +864,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, int, s, 64, 2, uint, u, 64, 2, expected_q_s64_7);
- TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 8, 16, expected_q_s64_8);
- TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 16, 8, expected_q_s64_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, int, s, 64, 2, float, f, 16, 8, expected_q_s64_10);
-+#endif
-
- /* vreinterpretq_u8_xx. */
- TEST_VREINTERPRET(q, uint, u, 8, 16, int, s, 8, 16, expected_q_u8_1);
-@@ -651,6 +878,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, uint, u, 8, 16, uint, u, 64, 2, expected_q_u8_7);
- TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 8, 16, expected_q_u8_8);
- TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 16, 8, expected_q_u8_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, uint, u, 8, 16, float, f, 16, 8, expected_q_u8_10);
-+#endif
-
- /* vreinterpretq_u16_xx. */
- TEST_VREINTERPRET(q, uint, u, 16, 8, int, s, 8, 16, expected_q_u16_1);
-@@ -662,6 +892,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, uint, u, 16, 8, uint, u, 64, 2, expected_q_u16_7);
- TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 8, 16, expected_q_u16_8);
- TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 16, 8, expected_q_u16_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, uint, u, 16, 8, float, f, 16, 8, expected_q_u16_10);
-+#endif
-
- /* vreinterpretq_u32_xx. */
- TEST_VREINTERPRET(q, uint, u, 32, 4, int, s, 8, 16, expected_q_u32_1);
-@@ -673,6 +906,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, uint, u, 32, 4, uint, u, 64, 2, expected_q_u32_7);
- TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 8, 16, expected_q_u32_8);
- TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 16, 8, expected_q_u32_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, uint, u, 32, 4, float, f, 16, 8, expected_q_u32_10);
-+#endif
-
- /* vreinterpretq_u64_xx. */
- TEST_VREINTERPRET(q, uint, u, 64, 2, int, s, 8, 16, expected_q_u64_1);
-@@ -684,6 +920,37 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, uint, u, 64, 2, uint, u, 32, 4, expected_q_u64_7);
- TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 8, 16, expected_q_u64_8);
- TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 16, 8, expected_q_u64_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET(q, uint, u, 64, 2, float, f, 16, 8, expected_q_u64_10);
-+#endif
-+
-+ /* vreinterpretq_p8_xx. */
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 8, 16, expected_q_p8_1);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 16, 8, expected_q_p8_2);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 32, 4, expected_q_p8_3);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, int, s, 64, 2, expected_q_p8_4);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 8, 16, expected_q_p8_5);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 16, 8, expected_q_p8_6);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 32, 4, expected_q_p8_7);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, uint, u, 64, 2, expected_q_p8_8);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, poly, p, 16, 8, expected_q_p8_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, float, f, 16, 8, expected_q_p8_10);
-+#endif
-+
-+ /* vreinterpretq_p16_xx. */
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 8, 16, expected_q_p16_1);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 16, 8, expected_q_p16_2);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 32, 4, expected_q_p16_3);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, int, s, 64, 2, expected_q_p16_4);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 8, 16, expected_q_p16_5);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 16, 8, expected_q_p16_6);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 32, 4, expected_q_p16_7);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, uint, u, 64, 2, expected_q_p16_8);
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, poly, p, 8, 16, expected_q_p16_9);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, float, f, 16, 8, expected_q_p16_10);
-+#endif
-
- /* vreinterpret_f32_xx. */
- TEST_VREINTERPRET_FP(, float, f, 32, 2, int, s, 8, 8, expected_f32_1);
-@@ -696,6 +963,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET_FP(, float, f, 32, 2, uint, u, 64, 1, expected_f32_8);
- TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 8, 8, expected_f32_9);
- TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 16, 4, expected_f32_10);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP(, float, f, 32, 2, float, f, 16, 4, expected_f32_11);
-+#endif
-
- /* vreinterpretq_f32_xx. */
- TEST_VREINTERPRET_FP(q, float, f, 32, 4, int, s, 8, 16, expected_q_f32_1);
-@@ -708,6 +978,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET_FP(q, float, f, 32, 4, uint, u, 64, 2, expected_q_f32_8);
- TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 8, 16, expected_q_f32_9);
- TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 16, 8, expected_q_f32_10);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP(q, float, f, 32, 4, float, f, 16, 8, expected_q_f32_11);
-+#endif
-
- /* vreinterpret_xx_f32. */
- TEST_VREINTERPRET(, int, s, 8, 8, float, f, 32, 2, expected_xx_f32_1);
-@@ -720,6 +993,9 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(, uint, u, 64, 1, float, f, 32, 2, expected_xx_f32_8);
- TEST_VREINTERPRET_POLY(, poly, p, 8, 8, float, f, 32, 2, expected_xx_f32_9);
- TEST_VREINTERPRET_POLY(, poly, p, 16, 4, float, f, 32, 2, expected_xx_f32_10);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, float, f, 32, 2, expected_xx_f32_11);
-+#endif
-
- /* vreinterpretq_xx_f32. */
- TEST_VREINTERPRET(q, int, s, 8, 16, float, f, 32, 4, expected_q_xx_f32_1);
-@@ -732,6 +1008,33 @@ void exec_vreinterpret (void)
- TEST_VREINTERPRET(q, uint, u, 64, 2, float, f, 32, 4, expected_q_xx_f32_8);
- TEST_VREINTERPRET_POLY(q, poly, p, 8, 16, float, f, 32, 4, expected_q_xx_f32_9);
- TEST_VREINTERPRET_POLY(q, poly, p, 16, 8, float, f, 32, 4, expected_q_xx_f32_10);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, float, f, 32, 4, expected_q_xx_f32_11);
-+
-+ /* vreinterpret_f16_xx. */
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 8, 8, expected_f16_1);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 16, 4, expected_f16_2);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 32, 2, expected_f16_3);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, int, s, 64, 1, expected_f16_4);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 8, 8, expected_f16_5);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 16, 4, expected_f16_6);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 32, 2, expected_f16_7);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, uint, u, 64, 1, expected_f16_8);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 8, 8, expected_f16_9);
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 16, 4, expected_f16_10);
-+
-+ /* vreinterpretq_f16_xx. */
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 8, 16, expected_q_f16_1);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 16, 8, expected_q_f16_2);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 32, 4, expected_q_f16_3);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, int, s, 64, 2, expected_q_f16_4);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 8, 16, expected_q_f16_5);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 16, 8, expected_q_f16_6);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 32, 4, expected_q_f16_7);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, uint, u, 64, 2, expected_q_f16_8);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 8, 16, expected_q_f16_9);
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 16, 8, expected_q_f16_10);
-+#endif
- }
-
- int main (void)
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p128.c
-@@ -0,0 +1,165 @@
-+/* This file contains tests for the vreinterpret *p128 intrinsics. */
-+
-+/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
-+/* { dg-add-options arm_crypto } */
-+/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results: vreinterpretq_p128_*. */
-+VECT_VAR_DECL(vreint_expected_q_p128_s8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-+ 0xfffefdfcfbfaf9f8 };
-+VECT_VAR_DECL(vreint_expected_q_p128_s16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
-+ 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(vreint_expected_q_p128_s32,poly,64,2) [] = { 0xfffffff1fffffff0,
-+ 0xfffffff3fffffff2 };
-+VECT_VAR_DECL(vreint_expected_q_p128_s64,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_p128_u8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-+ 0xfffefdfcfbfaf9f8 };
-+VECT_VAR_DECL(vreint_expected_q_p128_u16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
-+ 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(vreint_expected_q_p128_u32,poly,64,2) [] = { 0xfffffff1fffffff0,
-+ 0xfffffff3fffffff2 };
-+VECT_VAR_DECL(vreint_expected_q_p128_u64,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_p128_p8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-+ 0xfffefdfcfbfaf9f8 };
-+VECT_VAR_DECL(vreint_expected_q_p128_p16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
-+ 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(vreint_expected_q_p128_f32,poly,64,2) [] = { 0xc1700000c1800000,
-+ 0xc1500000c1600000 };
-+VECT_VAR_DECL(vreint_expected_q_p128_f16,poly,64,2) [] = { 0xca80cb00cb80cc00,
-+ 0xc880c900c980ca00 };
-+
-+/* Expected results: vreinterpretq_*_p128. */
-+VECT_VAR_DECL(vreint_expected_q_s8_p128,int,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_q_s16_p128,int,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_q_s32_p128,int,32,4) [] = { 0xfffffff0, 0xffffffff,
-+ 0xfffffff1, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_q_s64_p128,int,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_u8_p128,uint,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_q_u16_p128,uint,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_q_u32_p128,uint,32,4) [] = { 0xfffffff0, 0xffffffff,
-+ 0xfffffff1, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_q_u64_p128,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_p8_p128,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_q_p16_p128,poly,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_q_p64_p128,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_f32_p128,hfloat,32,4) [] = { 0xfffffff0, 0xffffffff,
-+ 0xfffffff1, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_q_f16_p128,hfloat,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+
-+int main (void)
-+{
-+ DECL_VARIABLE_128BITS_VARIANTS(vreint_vector);
-+ DECL_VARIABLE_128BITS_VARIANTS(vreint_vector_res);
-+
-+ clean_results ();
-+
-+ TEST_MACRO_128BITS_VARIANTS_2_5(VLOAD, vreint_vector, buffer);
-+ VLOAD(vreint_vector, buffer, q, poly, p, 64, 2);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ VLOAD(vreint_vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD(vreint_vector, buffer, q, float, f, 32, 4);
-+
-+ /* vreinterpretq_p128_* tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VREINTERPRETQ_P128_*"
-+
-+ /* Since there is no way to store a poly128_t value, convert to
-+ poly64x2_t before storing. This means that we are not able to
-+ test vreinterpretq_p128* alone, and that errors in
-+ vreinterpretq_p64_p128 could compensate for errors in
-+ vreinterpretq_p128*. */
-+#define TEST_VREINTERPRET128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
-+ VECT_VAR(vreint_vector_res, poly, 64, 2) = vreinterpretq_p64_p128( \
-+ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS))); \
-+ vst1##Q##_##T2##64(VECT_VAR(result, poly, 64, 2), \
-+ VECT_VAR(vreint_vector_res, poly, 64, 2)); \
-+ CHECK_POLY(TEST_MSG, T1, 64, 2, PRIx##64, EXPECTED, "");
-+
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 8, 16, vreint_expected_q_p128_s8);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 16, 8, vreint_expected_q_p128_s16);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 32, 4, vreint_expected_q_p128_s32);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, int, s, 64, 2, vreint_expected_q_p128_s64);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 8, 16, vreint_expected_q_p128_u8);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 16, 8, vreint_expected_q_p128_u16);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 32, 4, vreint_expected_q_p128_u32);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, uint, u, 64, 2, vreint_expected_q_p128_u64);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, poly, p, 8, 16, vreint_expected_q_p128_p8);
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, poly, p, 16, 8, vreint_expected_q_p128_p16);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, float, f, 16, 8, vreint_expected_q_p128_f16);
-+#endif
-+ TEST_VREINTERPRET128(q, poly, p, 128, 1, float, f, 32, 4, vreint_expected_q_p128_f32);
-+
-+ /* vreinterpretq_*_p128 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VREINTERPRETQ_*_P128"
-+
-+ /* Since there is no way to load a poly128_t value, load a
-+ poly64x2_t and convert it to poly128_t. This means that we are
-+ not able to test vreinterpretq_*_p128 alone, and that errors in
-+ vreinterpretq_p128_p64 could compensate for errors in
-+   vreinterpretq_*_p128. */
-+#define TEST_VREINTERPRET_FROM_P128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
-+ VECT_VAR(vreint_vector_res, T1, W, N) = \
-+ vreinterpret##Q##_##T2##W##_##TS2##WS( \
-+ vreinterpretq_p128_p64(VECT_VAR(vreint_vector, TS1, 64, 2))); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vreint_vector_res, T1, W, N)); \
-+ CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
-+
-+#define TEST_VREINTERPRET_FP_FROM_P128(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
-+ VECT_VAR(vreint_vector_res, T1, W, N) = \
-+ vreinterpret##Q##_##T2##W##_##TS2##WS( \
-+ vreinterpretq_p128_p64(VECT_VAR(vreint_vector, TS1, 64, 2))); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vreint_vector_res, T1, W, N)); \
-+ CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
-+
-+ TEST_VREINTERPRET_FROM_P128(q, int, s, 8, 16, poly, p, 128, 1, vreint_expected_q_s8_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, int, s, 16, 8, poly, p, 128, 1, vreint_expected_q_s16_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, int, s, 32, 4, poly, p, 128, 1, vreint_expected_q_s32_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, int, s, 64, 2, poly, p, 128, 1, vreint_expected_q_s64_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, uint, u, 8, 16, poly, p, 128, 1, vreint_expected_q_u8_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, uint, u, 16, 8, poly, p, 128, 1, vreint_expected_q_u16_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, uint, u, 32, 4, poly, p, 128, 1, vreint_expected_q_u32_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, uint, u, 64, 2, poly, p, 128, 1, vreint_expected_q_u64_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, poly, p, 8, 16, poly, p, 128, 1, vreint_expected_q_p8_p128);
-+ TEST_VREINTERPRET_FROM_P128(q, poly, p, 16, 8, poly, p, 128, 1, vreint_expected_q_p16_p128);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP_FROM_P128(q, float, f, 16, 8, poly, p, 128, 1, vreint_expected_q_f16_p128);
-+#endif
-+ TEST_VREINTERPRET_FP_FROM_P128(q, float, f, 32, 4, poly, p, 128, 1, vreint_expected_q_f32_p128);
-+
-+ return 0;
-+}
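
Since vst1q has no poly128_t form, the macros above always bounce through
poly64x2_t. Outside the test harness the round-trip reduces to a single
reinterpret before the store; a minimal sketch (store_p128 is hypothetical,
assuming the same armv8-a+crypto options as the test):

    #include <arm_neon.h>

    /* Store a poly128_t by reinterpreting it as poly64x2_t first --
       the same workaround TEST_VREINTERPRET128 relies on above.  */
    static void
    store_p128 (poly64_t *dst, poly128_t x)
    {
      vst1q_p64 (dst, vreinterpretq_p64_p128 (x));
    }
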
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p64.c
-@@ -0,0 +1,216 @@
-+/* This file contains tests for the vreinterpret *p64 intrinsics. */
-+
-+/* { dg-require-effective-target arm_crypto_ok { target { arm*-*-* } } } */
-+/* { dg-add-options arm_crypto } */
-+/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results: vreinterpret_p64_*. */
-+VECT_VAR_DECL(vreint_expected_p64_s8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
-+VECT_VAR_DECL(vreint_expected_p64_s16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
-+VECT_VAR_DECL(vreint_expected_p64_s32,poly,64,1) [] = { 0xfffffff1fffffff0 };
-+VECT_VAR_DECL(vreint_expected_p64_s64,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vreint_expected_p64_u8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
-+VECT_VAR_DECL(vreint_expected_p64_u16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
-+VECT_VAR_DECL(vreint_expected_p64_u32,poly,64,1) [] = { 0xfffffff1fffffff0 };
-+VECT_VAR_DECL(vreint_expected_p64_u64,poly,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vreint_expected_p64_p8,poly,64,1) [] = { 0xf7f6f5f4f3f2f1f0 };
-+VECT_VAR_DECL(vreint_expected_p64_p16,poly,64,1) [] = { 0xfff3fff2fff1fff0 };
-+VECT_VAR_DECL(vreint_expected_p64_f32,poly,64,1) [] = { 0xc1700000c1800000 };
-+VECT_VAR_DECL(vreint_expected_p64_f16,poly,64,1) [] = { 0xca80cb00cb80cc00 };
-+
-+/* Expected results: vreinterpretq_p64_*. */
-+VECT_VAR_DECL(vreint_expected_q_p64_s8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-+ 0xfffefdfcfbfaf9f8 };
-+VECT_VAR_DECL(vreint_expected_q_p64_s16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
-+ 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(vreint_expected_q_p64_s32,poly,64,2) [] = { 0xfffffff1fffffff0,
-+ 0xfffffff3fffffff2 };
-+VECT_VAR_DECL(vreint_expected_q_p64_s64,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_p64_u8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-+ 0xfffefdfcfbfaf9f8 };
-+VECT_VAR_DECL(vreint_expected_q_p64_u16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
-+ 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(vreint_expected_q_p64_u32,poly,64,2) [] = { 0xfffffff1fffffff0,
-+ 0xfffffff3fffffff2 };
-+VECT_VAR_DECL(vreint_expected_q_p64_u64,poly,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_p64_p8,poly,64,2) [] = { 0xf7f6f5f4f3f2f1f0,
-+ 0xfffefdfcfbfaf9f8 };
-+VECT_VAR_DECL(vreint_expected_q_p64_p16,poly,64,2) [] = { 0xfff3fff2fff1fff0,
-+ 0xfff7fff6fff5fff4 };
-+VECT_VAR_DECL(vreint_expected_q_p64_f32,poly,64,2) [] = { 0xc1700000c1800000,
-+ 0xc1500000c1600000 };
-+VECT_VAR_DECL(vreint_expected_q_p64_f16,poly,64,2) [] = { 0xca80cb00cb80cc00,
-+ 0xc880c900c980ca00 };
-+
-+/* Expected results: vreinterpret_*_p64. */
-+VECT_VAR_DECL(vreint_expected_s8_p64,int,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_s16_p64,int,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_s32_p64,int,32,2) [] = { 0xfffffff0, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_s64_p64,int,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vreint_expected_u8_p64,uint,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_u16_p64,uint,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_u32_p64,uint,32,2) [] = { 0xfffffff0, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_u64_p64,uint,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(vreint_expected_p8_p64,poly,8,8) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_p16_p64,poly,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_f32_p64,hfloat,32,2) [] = { 0xfffffff0, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_f16_p64,hfloat,16,4) [] = { 0xfff0, 0xffff, 0xffff, 0xffff };
-+
-+/* Expected results: vreinterpretq_*_p64. */
-+VECT_VAR_DECL(vreint_expected_q_s8_p64,int,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_q_s16_p64,int,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_q_s32_p64,int,32,4) [] = { 0xfffffff0, 0xffffffff,
-+ 0xfffffff1, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_q_s64_p64,int,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_u8_p64,uint,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_q_u16_p64,uint,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_q_u32_p64,uint,32,4) [] = { 0xfffffff0, 0xffffffff,
-+ 0xfffffff1, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_q_u64_p64,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0xfffffffffffffff1 };
-+VECT_VAR_DECL(vreint_expected_q_p8_p64,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xf1, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(vreint_expected_q_p16_p64,poly,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+VECT_VAR_DECL(vreint_expected_q_f32_p64,hfloat,32,4) [] = { 0xfffffff0, 0xffffffff,
-+ 0xfffffff1, 0xffffffff };
-+VECT_VAR_DECL(vreint_expected_q_f16_p64,hfloat,16,8) [] = { 0xfff0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xfff1, 0xffff,
-+ 0xffff, 0xffff };
-+
-+int main (void)
-+{
-+#define TEST_VREINTERPRET(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
-+ VECT_VAR(vreint_vector_res, T1, W, N) = \
-+ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vreint_vector_res, T1, W, N)); \
-+ CHECK(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
-+
-+#define TEST_VREINTERPRET_TO_POLY(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
-+ VECT_VAR(vreint_vector_res, T1, W, N) = \
-+ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vreint_vector_res, T1, W, N)); \
-+ CHECK_POLY(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
-+
-+#define TEST_VREINTERPRET_FP(Q, T1, T2, W, N, TS1, TS2, WS, NS, EXPECTED) \
-+ VECT_VAR(vreint_vector_res, T1, W, N) = \
-+ vreinterpret##Q##_##T2##W##_##TS2##WS(VECT_VAR(vreint_vector, TS1, WS, NS)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \
-+ VECT_VAR(vreint_vector_res, T1, W, N)); \
-+ CHECK_FP(TEST_MSG, T1, W, N, PRIx##W, EXPECTED, "");
-+
-+ DECL_VARIABLE_ALL_VARIANTS(vreint_vector);
-+ DECL_VARIABLE_ALL_VARIANTS(vreint_vector_res);
-+
-+ clean_results ();
-+
-+ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vreint_vector, buffer);
-+ VLOAD(vreint_vector, buffer, , poly, p, 64, 1);
-+ VLOAD(vreint_vector, buffer, q, poly, p, 64, 2);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ VLOAD(vreint_vector, buffer, , float, f, 16, 4);
-+ VLOAD(vreint_vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD(vreint_vector, buffer, , float, f, 32, 2);
-+ VLOAD(vreint_vector, buffer, q, float, f, 32, 4);
-+
-+ /* vreinterpret_p64_* tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VREINTERPRET_P64_*"
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 8, 8, vreint_expected_p64_s8);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 16, 4, vreint_expected_p64_s16);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 32, 2, vreint_expected_p64_s32);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, int, s, 64, 1, vreint_expected_p64_s64);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 8, 8, vreint_expected_p64_u8);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 16, 4, vreint_expected_p64_u16);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 32, 2, vreint_expected_p64_u32);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, uint, u, 64, 1, vreint_expected_p64_u64);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, poly, p, 8, 8, vreint_expected_p64_p8);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, poly, p, 16, 4, vreint_expected_p64_p16);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, float, f, 16, 4, vreint_expected_p64_f16);
-+#endif
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 64, 1, float, f, 32, 2, vreint_expected_p64_f32);
-+
-+ /* vreinterpretq_p64_* tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VREINTERPRETQ_P64_*"
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 8, 16, vreint_expected_q_p64_s8);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 16, 8, vreint_expected_q_p64_s16);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 32, 4, vreint_expected_q_p64_s32);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, int, s, 64, 2, vreint_expected_q_p64_s64);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 8, 16, vreint_expected_q_p64_u8);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 16, 8, vreint_expected_q_p64_u16);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 32, 4, vreint_expected_q_p64_u32);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, uint, u, 64, 2, vreint_expected_q_p64_u64);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, poly, p, 8, 16, vreint_expected_q_p64_p8);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, poly, p, 16, 8, vreint_expected_q_p64_p16);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, float, f, 16, 8, vreint_expected_q_p64_f16);
-+#endif
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 64, 2, float, f, 32, 4, vreint_expected_q_p64_f32);
-+
-+ /* vreinterpret_*_p64 tests. */
-+#undef TEST_MSG
-+#define TEST_MSG "VREINTERPRET_*_P64"
-+
-+ TEST_VREINTERPRET(, int, s, 8, 8, poly, p, 64, 1, vreint_expected_s8_p64);
-+ TEST_VREINTERPRET(, int, s, 16, 4, poly, p, 64, 1, vreint_expected_s16_p64);
-+ TEST_VREINTERPRET(, int, s, 32, 2, poly, p, 64, 1, vreint_expected_s32_p64);
-+ TEST_VREINTERPRET(, int, s, 64, 1, poly, p, 64, 1, vreint_expected_s64_p64);
-+ TEST_VREINTERPRET(, uint, u, 8, 8, poly, p, 64, 1, vreint_expected_u8_p64);
-+ TEST_VREINTERPRET(, uint, u, 16, 4, poly, p, 64, 1, vreint_expected_u16_p64);
-+ TEST_VREINTERPRET(, uint, u, 32, 2, poly, p, 64, 1, vreint_expected_u32_p64);
-+ TEST_VREINTERPRET(, uint, u, 64, 1, poly, p, 64, 1, vreint_expected_u64_p64);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 8, 8, poly, p, 64, 1, vreint_expected_p8_p64);
-+ TEST_VREINTERPRET_TO_POLY(, poly, p, 16, 4, poly, p, 64, 1, vreint_expected_p16_p64);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP(, float, f, 16, 4, poly, p, 64, 1, vreint_expected_f16_p64);
-+#endif
-+ TEST_VREINTERPRET_FP(, float, f, 32, 2, poly, p, 64, 1, vreint_expected_f32_p64);
-+ TEST_VREINTERPRET(q, int, s, 8, 16, poly, p, 64, 2, vreint_expected_q_s8_p64);
-+ TEST_VREINTERPRET(q, int, s, 16, 8, poly, p, 64, 2, vreint_expected_q_s16_p64);
-+ TEST_VREINTERPRET(q, int, s, 32, 4, poly, p, 64, 2, vreint_expected_q_s32_p64);
-+ TEST_VREINTERPRET(q, int, s, 64, 2, poly, p, 64, 2, vreint_expected_q_s64_p64);
-+ TEST_VREINTERPRET(q, uint, u, 8, 16, poly, p, 64, 2, vreint_expected_q_u8_p64);
-+ TEST_VREINTERPRET(q, uint, u, 16, 8, poly, p, 64, 2, vreint_expected_q_u16_p64);
-+ TEST_VREINTERPRET(q, uint, u, 32, 4, poly, p, 64, 2, vreint_expected_q_u32_p64);
-+ TEST_VREINTERPRET(q, uint, u, 64, 2, poly, p, 64, 2, vreint_expected_q_u64_p64);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 8, 16, poly, p, 64, 2, vreint_expected_q_p8_p64);
-+ TEST_VREINTERPRET_TO_POLY(q, poly, p, 16, 8, poly, p, 64, 2, vreint_expected_q_p16_p64);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ TEST_VREINTERPRET_FP(q, float, f, 16, 8, poly, p, 64, 2, vreint_expected_q_f16_p64);
-+#endif
-+ TEST_VREINTERPRET_FP(q, float, f, 32, 4, poly, p, 64, 2, vreint_expected_q_f32_p64);
-+
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrev.c
-@@ -63,6 +63,10 @@ VECT_VAR_DECL(expected_vrev64,uint,32,2) [] = { 0xfffffff1, 0xfffffff0 };
- VECT_VAR_DECL(expected_vrev64,poly,8,8) [] = { 0xf7, 0xf6, 0xf5, 0xf4,
- 0xf3, 0xf2, 0xf1, 0xf0 };
- VECT_VAR_DECL(expected_vrev64,poly,16,4) [] = { 0xfff3, 0xfff2, 0xfff1, 0xfff0 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected_vrev64, hfloat, 16, 4) [] = { 0xca80, 0xcb00,
-+ 0xcb80, 0xcc00 };
-+#endif
- VECT_VAR_DECL(expected_vrev64,hfloat,32,2) [] = { 0xc1700000, 0xc1800000 };
- VECT_VAR_DECL(expected_vrev64,int,8,16) [] = { 0xf7, 0xf6, 0xf5, 0xf4,
- 0xf3, 0xf2, 0xf1, 0xf0,
-@@ -86,6 +90,12 @@ VECT_VAR_DECL(expected_vrev64,poly,8,16) [] = { 0xf7, 0xf6, 0xf5, 0xf4,
- 0xfb, 0xfa, 0xf9, 0xf8 };
- VECT_VAR_DECL(expected_vrev64,poly,16,8) [] = { 0xfff3, 0xfff2, 0xfff1, 0xfff0,
- 0xfff7, 0xfff6, 0xfff5, 0xfff4 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected_vrev64, hfloat, 16, 8) [] = { 0xca80, 0xcb00,
-+ 0xcb80, 0xcc00,
-+ 0xc880, 0xc900,
-+ 0xc980, 0xca00 };
-+#endif
- VECT_VAR_DECL(expected_vrev64,hfloat,32,4) [] = { 0xc1700000, 0xc1800000,
- 0xc1500000, 0xc1600000 };
-
-@@ -104,6 +114,10 @@ void exec_vrev (void)
-
- /* Initialize input "vector" from "buffer". */
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD (vector, buffer, , float, f, 16, 4);
-+ VLOAD (vector, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector, buffer, , float, f, 32, 2);
- VLOAD(vector, buffer, q, float, f, 32, 4);
-
-@@ -118,10 +132,10 @@ void exec_vrev (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vrev16, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev16, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev16, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev16, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev16, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev16, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev16, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VREV32"
-@@ -142,14 +156,14 @@ void exec_vrev (void)
- CHECK(TEST_MSG, int, 16, 4, PRIx16, expected_vrev32, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev32, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev32, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, "");
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev32, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev32, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev32, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev32, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev32, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev32, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, "");
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev32, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev32, "");
-
- #undef TEST_MSG
- #define TEST_MSG "VREV64"
-@@ -176,17 +190,23 @@ void exec_vrev (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vrev64, "");
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_vrev64, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_vrev64, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, "");
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vrev64, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_vrev64, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_vrev64, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_vrev64, "");
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_vrev64, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_vrev64, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_vrev64, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_vrev64, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_vrev64, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_vrev64, "");
-
-+#if defined (FP16_SUPPORTED)
-+ TEST_VREV (, float, f, 16, 4, 64);
-+ TEST_VREV (q, float, f, 16, 8, 64);
-+  CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_vrev64, "");
-+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_vrev64, "");
-+#endif
- TEST_VREV(, float, f, 32, 2, 64);
- TEST_VREV(q, float, f, 32, 4, 64);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_vrev64, "");
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnd.c
-@@ -0,0 +1,24 @@
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
-+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
-+VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
-+ 0xc1600000, 0xc1500000 };
-+
-+#define INSN vrnd
-+#define TEST_MSG "VRND"
-+
-+#include "vrndX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndX.inc
-@@ -0,0 +1,63 @@
-+#define FNNAME1(NAME) exec_ ## NAME
-+#define FNNAME(NAME) FNNAME1 (NAME)
-+
-+void FNNAME (INSN) (void)
-+{
-+ /* vector_res = vrndX (vector), then store the result. */
-+#define TEST_VRND2(INSN, Q, T1, T2, W, N) \
-+ VECT_VAR (vector_res, T1, W, N) = \
-+ INSN##Q##_##T2##W (VECT_VAR (vector, T1, W, N)); \
-+ vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N), \
-+ VECT_VAR (vector_res, T1, W, N))
-+
-+  /* Two auxiliary macros are necessary to expand INSN. */
-+#define TEST_VRND1(INSN, Q, T1, T2, W, N) \
-+ TEST_VRND2 (INSN, Q, T1, T2, W, N)
-+
-+#define TEST_VRND(Q, T1, T2, W, N) \
-+ TEST_VRND1 (INSN, Q, T1, T2, W, N)
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
-+ DECL_VARIABLE (vector, float, 32, 2);
-+ DECL_VARIABLE (vector, float, 32, 4);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
-+ DECL_VARIABLE (vector_res, float, 32, 2);
-+ DECL_VARIABLE (vector_res, float, 32, 4);
-+
-+ clean_results ();
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VLOAD (vector, buffer, , float, f, 16, 4);
-+ VLOAD (vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD (vector, buffer, , float, f, 32, 2);
-+ VLOAD (vector, buffer, q, float, f, 32, 4);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRND ( , float, f, 16, 4);
-+ TEST_VRND (q, float, f, 16, 8);
-+#endif
-+ TEST_VRND ( , float, f, 32, 2);
-+ TEST_VRND (q, float, f, 32, 4);
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected, "");
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected, "");
-+#endif
-+ CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expected, "");
-+ CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected, "");
-+}
-+
-+int
-+main (void)
-+{
-+ FNNAME (INSN) ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrnda.c
-@@ -0,0 +1,24 @@
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
-+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
-+VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
-+ 0xc1600000, 0xc1500000 };
-+
-+#define INSN vrnda
-+#define TEST_MSG "VRNDA"
-+
-+#include "vrndX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndah_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4200 /* 3.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc800 /* -8.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a80 /* 13.000000 */,
-+ 0xc600 /* -6.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDAH_F16"
-+#define INSN_NAME vrndah_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4200 /* 3.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc700 /* -7.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a80 /* 13.000000 */,
-+ 0xc600 /* -6.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDH_F16"
-+#define INSN_NAME vrndh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndi_f16_1.c
-@@ -0,0 +1,71 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (123.4)
-+#define RNDI_A 0x57B0 /* FP16_C (123). */
-+#define B FP16_C (-567.5)
-+#define RNDI_B 0xE070 /* FP16_C (-568). */
-+#define C FP16_C (-34.8)
-+#define RNDI_C 0xD060 /* FP16_C (-35). */
-+#define D FP16_C (1024)
-+#define RNDI_D 0x6400 /* FP16_C (1024). */
-+#define E FP16_C (663.1)
-+#define RNDI_E 0x612E /* FP16_C (663). */
-+#define F FP16_C (169.1)
-+#define RNDI_F 0x5948 /* FP16_C (169). */
-+#define G FP16_C (-4.8)
-+#define RNDI_G 0xC500 /* FP16_C (-5). */
-+#define H FP16_C (77.5)
-+#define RNDI_H 0x54E0 /* FP16_C (78). */
-+
-+/* Expected results for vrndi. */
-+VECT_VAR_DECL (expected_static, hfloat, 16, 4) []
-+ = { RNDI_A, RNDI_B, RNDI_C, RNDI_D };
-+
-+VECT_VAR_DECL (expected_static, hfloat, 16, 8) []
-+ = { RNDI_A, RNDI_B, RNDI_C, RNDI_D, RNDI_E, RNDI_F, RNDI_G, RNDI_H };
-+
-+void exec_vrndi_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VRNDI (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A, B, C, D};
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vrndi_f16 (VECT_VAR (vsrc, float, 16, 4));
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VRNDIQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A, B, C, D, E, F, G, H};
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vrndiq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vrndi_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndih_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4200 /* 3.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc800 /* -8.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a80 /* 13.000000 */,
-+ 0xc600 /* -6.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDIH_F16"
-+#define INSN_NAME vrndih_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndm.c
-@@ -0,0 +1,24 @@
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
-+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
-+VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
-+ 0xc1600000, 0xc1500000 };
-+
-+#define INSN vrndm
-+#define TEST_MSG "VRNDM"
-+
-+#include "vrndX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndmh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4200 /* 3.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc200 /* -3.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc800 /* -8.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a80 /* 13.000000 */,
-+ 0xc700 /* -7.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDMH_F16"
-+#define INSN_NAME vrndmh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndn.c
-@@ -0,0 +1,24 @@
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
-+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
-+VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
-+ 0xc1600000, 0xc1500000 };
-+
-+#define INSN vrndn
-+#define TEST_MSG "VRNDN"
-+
-+#include "vrndX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndnh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4200 /* 3.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc800 /* -8.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a80 /* 13.000000 */,
-+ 0xc600 /* -6.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDNH_F16"
-+#define INSN_NAME vrndnh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndp.c
-@@ -0,0 +1,24 @@
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
-+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
-+VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
-+ 0xc1600000, 0xc1500000 };
-+
-+#define INSN vrndp
-+#define TEST_MSG "VRNDP"
-+
-+#include "vrndX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndph_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4400 /* 4.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0xc700 /* -7.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4b00 /* 14.000000 */,
-+ 0xc600 /* -6.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDPH_F16"
-+#define INSN_NAME vrndph_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndx.c
-@@ -0,0 +1,24 @@
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
-+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0xc1800000, 0xc1700000 };
-+VECT_VAR_DECL (expected, hfloat, 32, 4) [] = { 0xc1800000, 0xc1700000,
-+ 0xc1600000, 0xc1500000 };
-+
-+#define INSN vrndx
-+#define TEST_MSG "VRNDX"
-+
-+#include "vrndX.inc"
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrndxh_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x4000 /* 2.000000 */,
-+ 0x4200 /* 3.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0xc000 /* -2.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0xc800 /* -8.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x0000 /* 0.000000 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x4a80 /* 13.000000 */,
-+ 0xc600 /* -6.000000 */,
-+ 0x4d00 /* 20.000000 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VRNDNH_F16"
-+#define INSN_NAME vrndnh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrte.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrte.c
-@@ -7,6 +7,11 @@
- VECT_VAR_DECL(expected,uint,32,2) [] = { 0xffffffff, 0xffffffff };
- VECT_VAR_DECL(expected,uint,32,4) [] = { 0x9c800000, 0x9c800000,
- 0x9c800000, 0x9c800000 };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0x324c, 0x324c, 0x324c, 0x324c };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0x3380, 0x3380, 0x3380, 0x3380,
-+ 0x3380, 0x3380, 0x3380, 0x3380 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x3e498000, 0x3e498000 };
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x3e700000, 0x3e700000,
- 0x3e700000, 0x3e700000 };
-@@ -22,17 +27,39 @@ VECT_VAR_DECL(expected_2,uint,32,4) [] = { 0xed000000, 0xed000000,
- 0xed000000, 0xed000000 };
-
- /* Expected results with FP special inputs values (NaNs, ...). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00,
-+ 0x7c00, 0x7c00 };
-+#endif
- VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x7f800000, 0x7f800000,
- 0x7f800000, 0x7f800000 };
-
- /* Expected results with FP special inputs values
- (negative, infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0 };
-+#endif
- VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results with FP special inputs values
- (-0, -infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp3, hfloat, 16, 4) [] = { 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00 };
-+VECT_VAR_DECL(expected_fp3, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+#endif
- VECT_VAR_DECL(expected_fp3,hfloat,32,2) [] = { 0xff800000, 0xff800000 };
- VECT_VAR_DECL(expected_fp3,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
- 0x7fc00000, 0x7fc00000 };
-@@ -50,32 +77,60 @@ void exec_vrsqrte(void)
- VECT_VAR(vector_res, T1, W, N))
-
- DECL_VARIABLE(vector, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 4);
-
- DECL_VARIABLE(vector_res, uint, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
- DECL_VARIABLE(vector_res, uint, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 4);
-
- clean_results ();
-
- /* Choose init value arbitrarily. */
- VDUP(vector, , uint, u, 32, 2, 0x12345678);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 25.799999f);
-+#endif
- VDUP(vector, , float, f, 32, 2, 25.799999f);
- VDUP(vector, q, uint, u, 32, 4, 0xABCDEF10);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, q, float, f, 16, 8, 18.2f);
-+#endif
- VDUP(vector, q, float, f, 32, 4, 18.2f);
-
- /* Apply the operator. */
- TEST_VRSQRTE(, uint, u, 32, 2);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTE(, float, f, 16, 4);
-+#endif
- TEST_VRSQRTE(, float, f, 32, 2);
- TEST_VRSQRTE(q, uint, u, 32, 4);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTE(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTE(q, float, f, 32, 4);
-
- #define CMT ""
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, CMT);
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, CMT);
-
-@@ -110,42 +165,78 @@ void exec_vrsqrte(void)
-
-
- /* Test FP variants with special input values (NaNs, ...). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, NAN);
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+#endif
- VDUP(vector, , float, f, 32, 2, NAN);
- VDUP(vector, q, float, f, 32, 4, 0.0f);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTE(, float, f, 16, 4);
-+ TEST_VRSQRTE(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTE(, float, f, 32, 2);
- TEST_VRSQRTE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (NaN, 0)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT);
-
-
- /* Test FP variants with special input values (negative, infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, -1.0f);
-+ VDUP(vector, q, float, f, 16, 8, HUGE_VALF);
-+#endif
- VDUP(vector, , float, f, 32, 2, -1.0f);
- VDUP(vector, q, float, f, 32, 4, HUGE_VALF);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTE(, float, f, 16, 4);
-+ TEST_VRSQRTE(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTE(, float, f, 32, 2);
- TEST_VRSQRTE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (negative, infinity)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT);
-
- /* Test FP variants with special input values (-0, -infinity). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, -0.0f);
-+ VDUP(vector, q, float, f, 16, 8, -HUGE_VALF);
-+#endif
- VDUP(vector, , float, f, 32, 2, -0.0f);
- VDUP(vector, q, float, f, 32, 4, -HUGE_VALF);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTE(, float, f, 16, 4);
-+ TEST_VRSQRTE(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTE(, float, f, 32, 2);
- TEST_VRSQRTE(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (-0, -infinity)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp3, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp3, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp3, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp3, CMT);
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrteh_f16_1.c
-@@ -0,0 +1,30 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+float16_t input[] = { 123.4, 67.8, 34.8, 24.0, 66.1, 144.0, 4.8, 77.0 };
-+uint16_t expected[] = { 0x2DC4 /* FP16_C (1/__builtin_sqrtf (123.4)). */,
-+ 0x2FC8 /* FP16_C (1/__builtin_sqrtf (67.8)). */,
-+ 0x316C /* FP16_C (1/__builtin_sqrtf (34.8)). */,
-+ 0x3288 /* FP16_C (1/__builtin_sqrtf (24.0)). */,
-+ 0x2FDC /* FP16_C (1/__builtin_sqrtf (66.1)). */,
-+ 0x2D54 /* FP16_C (1/__builtin_sqrtf (144.0)). */,
-+ 0x3750 /* FP16_C (1/__builtin_sqrtf (4.8)). */,
-+ 0x2F48 /* FP16_C (1/__builtin_sqrtf (77.0)). */ };
-+
-+#define TEST_MSG "VRSQRTEH_F16"
-+#define INSN_NAME vrsqrteh_f16
-+
-+#define INPUT input
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrts.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrts.c
-@@ -4,22 +4,51 @@
- #include <math.h>
-
- /* Expected results. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected, hfloat, 16, 4) [] = { 0xd3cb, 0xd3cb, 0xd3cb, 0xd3cb };
-+VECT_VAR_DECL(expected, hfloat, 16, 8) [] = { 0xc726, 0xc726, 0xc726, 0xc726,
-+ 0xc726, 0xc726, 0xc726, 0xc726 };
-+#endif
- VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc2796b84, 0xc2796b84 };
- VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc0e4a3d8, 0xc0e4a3d8,
- 0xc0e4a3d8, 0xc0e4a3d8 };
-
- /* Expected results with input=NaN. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_nan, hfloat, 16, 4) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+VECT_VAR_DECL(expected_nan, hfloat, 16, 8) [] = { 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00,
-+ 0x7e00, 0x7e00 };
-+#endif
- VECT_VAR_DECL(expected_nan,hfloat,32,2) [] = { 0x7fc00000, 0x7fc00000 };
- VECT_VAR_DECL(expected_nan,hfloat,32,4) [] = { 0x7fc00000, 0x7fc00000,
- 0x7fc00000, 0x7fc00000 };
-
- /* Expected results with FP special inputs values (infinity, 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 4) [] = { 0xfc00, 0xfc00,
-+ 0xfc00, 0xfc00 };
-+VECT_VAR_DECL(expected_fp1, hfloat, 16, 8) [] = { 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00 };
-+#endif
- VECT_VAR_DECL(expected_fp1,hfloat,32,2) [] = { 0xff800000, 0xff800000 };
- VECT_VAR_DECL(expected_fp1,hfloat,32,4) [] = { 0x3fc00000, 0x3fc00000,
- 0x3fc00000, 0x3fc00000 };
-
- /* Expected results with only FP special inputs values (infinity,
- 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 4) [] = { 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00 };
-+VECT_VAR_DECL(expected_fp2, hfloat, 16, 8) [] = { 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00,
-+ 0x3e00, 0x3e00 };
-+#endif
- VECT_VAR_DECL(expected_fp2,hfloat,32,2) [] = { 0x3fc00000, 0x3fc00000 };
- VECT_VAR_DECL(expected_fp2,hfloat,32,4) [] = { 0x3fc00000, 0x3fc00000,
- 0x3fc00000, 0x3fc00000 };
-@@ -38,75 +67,143 @@ void exec_vrsqrts(void)
- VECT_VAR(vector_res, T1, W, N))
-
- /* No need for integer variants. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector, float, 32, 2);
- DECL_VARIABLE(vector, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector2, float, 32, 2);
- DECL_VARIABLE(vector2, float, 32, 4);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+#endif
- DECL_VARIABLE(vector_res, float, 32, 2);
- DECL_VARIABLE(vector_res, float, 32, 4);
-
- clean_results ();
-
- /* Choose init value arbitrarily. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, 12.9f);
-+ VDUP(vector, q, float, f, 16, 8, 9.1f);
-+#endif
- VDUP(vector, , float, f, 32, 2, 12.9f);
- VDUP(vector, q, float, f, 32, 4, 9.1f);
-
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector2, , float, f, 16, 4, 9.9f);
-+ VDUP(vector2, q, float, f, 16, 8, 1.9f);
-+#endif
- VDUP(vector2, , float, f, 32, 2, 9.9f);
- VDUP(vector2, q, float, f, 32, 4, 1.9f);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTS(, float, f, 16, 4);
-+ TEST_VRSQRTS(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTS(, float, f, 32, 2);
- TEST_VRSQRTS(q, float, f, 32, 4);
-
- #define CMT ""
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, CMT);
-
-
- /* Test FP variants with special input values (NaN). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, NAN);
-+ VDUP(vector2, q, float, f, 16, 8, NAN);
-+#endif
- VDUP(vector, , float, f, 32, 2, NAN);
- VDUP(vector2, q, float, f, 32, 4, NAN);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTS(, float, f, 16, 4);
-+ TEST_VRSQRTS(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTS(, float, f, 32, 2);
- TEST_VRSQRTS(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (NAN) and normal values"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_nan, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_nan, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_nan, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_nan, CMT);
-
-
- /* Test FP variants with special input values (infinity, 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, HUGE_VALF);
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+ /* Restore a normal value in vector2. */
-+ VDUP(vector2, q, float, f, 16, 8, 3.2f);
-+#endif
- VDUP(vector, , float, f, 32, 2, HUGE_VALF);
- VDUP(vector, q, float, f, 32, 4, 0.0f);
- /* Restore a normal value in vector2. */
- VDUP(vector2, q, float, f, 32, 4, 3.2f);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTS(, float, f, 16, 4);
-+ TEST_VRSQRTS(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTS(, float, f, 32, 2);
- TEST_VRSQRTS(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " FP special (infinity, 0) and normal values"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp1, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp1, CMT);
-
-
- /* Test FP variants with only special input values (infinity, 0). */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ VDUP(vector, , float, f, 16, 4, HUGE_VALF);
-+ VDUP(vector, q, float, f, 16, 8, 0.0f);
-+ VDUP(vector2, , float, f, 16, 4, -0.0f);
-+ VDUP(vector2, q, float, f, 16, 8, HUGE_VALF);
-+#endif
- VDUP(vector, , float, f, 32, 2, HUGE_VALF);
- VDUP(vector, q, float, f, 32, 4, 0.0f);
- VDUP(vector2, , float, f, 32, 2, -0.0f);
- VDUP(vector2, q, float, f, 32, 4, HUGE_VALF);
-
- /* Apply the operator. */
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ TEST_VRSQRTS(, float, f, 16, 4);
-+ TEST_VRSQRTS(q, float, f, 16, 8);
-+#endif
- TEST_VRSQRTS(, float, f, 32, 2);
- TEST_VRSQRTS(q, float, f, 32, 4);
-
- #undef CMT
- #define CMT " only FP special (infinity, 0)"
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_fp2, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_fp2, CMT);
-+#endif
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_fp2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_fp2, CMT);
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrsqrtsh_f16_1.c
-@@ -0,0 +1,50 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_fp16.h>
-+
-+/* Input values. */
-+#define A 12.4
-+#define B -5.8
-+#define C -3.8
-+#define D 10
-+#define E 66.1
-+#define F 16.1
-+#define G -4.8
-+#define H -77
-+
-+#define I 0.7
-+#define J -78
-+#define K 10.23
-+#define L 98
-+#define M 87
-+#define N -87.81
-+#define O -1.1
-+#define P 47.8
-+
-+float16_t input_1[] = { A, B, C, D, I, J, K, L };
-+float16_t input_2[] = { E, F, G, H, M, N, O, P };
-+uint16_t expected[] = { 0xDE62 /* (3.0f + (-A) * E) / 2.0f. */,
-+ 0x5206 /* (3.0f + (-B) * F) / 2.0f. */,
-+ 0xC7A0 /* (3.0f + (-C) * G) / 2.0f. */,
-+ 0x5E0A /* (3.0f + (-D) * H) / 2.0f. */,
-+ 0xCF3D /* (3.0f + (-I) * M) / 2.0f. */,
-+ 0xEAB0 /* (3.0f + (-J) * N) / 2.0f. */,
-+ 0x471F /* (3.0f + (-K) * O) / 2.0f. */,
-+ 0xE893 /* (3.0f + (-L) * P) / 2.0f. */ };
-+
-+#define TEST_MSG "VRSQRTSH_F16"
-+#define INSN_NAME vrsqrtsh_f16
-+
-+#define INPUT_1 input_1
-+#define INPUT_2 input_2
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsXi_n.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsXi_n.inc
-@@ -76,16 +76,16 @@ void FNNAME (INSN_NAME) (void)
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
-
- #ifdef EXTRA_TESTS
- EXTRA_TESTS();
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
-@@ -101,10 +101,8 @@ VECT_VAR_DECL(expected_negative_shift,uint,64,2) [] = { 0x7ffffffffffffff,
- 0x7ffffffffffffff };
-
-
--#ifndef INSN_NAME
- #define INSN_NAME vshl
- #define TEST_MSG "VSHL/VSHLQ"
--#endif
-
- #define FNNAME1(NAME) exec_ ## NAME
- #define FNNAME(NAME) FNNAME1(NAME)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshuffle.inc
-@@ -53,9 +53,17 @@ void FNNAME (INSN_NAME) (void)
- DECL_VSHUFFLE(float, 32, 4)
-
- DECL_ALL_VSHUFFLE();
-+#if defined (FP16_SUPPORTED)
-+ DECL_VSHUFFLE (float, 16, 4);
-+ DECL_VSHUFFLE (float, 16, 8);
-+#endif
-
- /* Initialize input "vector" from "buffer". */
- TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector1, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD (vector1, buffer, , float, f, 16, 4);
-+ VLOAD (vector1, buffer, q, float, f, 16, 8);
-+#endif
- VLOAD(vector1, buffer, , float, f, 32, 2);
- VLOAD(vector1, buffer, q, float, f, 32, 4);
-
-@@ -68,6 +76,9 @@ void FNNAME (INSN_NAME) (void)
- VDUP(vector2, , uint, u, 32, 2, 0x77);
- VDUP(vector2, , poly, p, 8, 8, 0x55);
- VDUP(vector2, , poly, p, 16, 4, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */
-+#endif
- VDUP(vector2, , float, f, 32, 2, 33.6f);
-
- VDUP(vector2, q, int, s, 8, 16, 0x11);
-@@ -78,8 +89,11 @@ void FNNAME (INSN_NAME) (void)
- VDUP(vector2, q, uint, u, 32, 4, 0x77);
- VDUP(vector2, q, poly, p, 8, 16, 0x55);
- VDUP(vector2, q, poly, p, 16, 8, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, q, float, f, 16, 8, 14.6f);
-+#endif
- VDUP(vector2, q, float, f, 32, 4, 33.8f);
--
-+
- #define TEST_ALL_VSHUFFLE(INSN) \
- TEST_VSHUFFLE(INSN, , int, s, 8, 8); \
- TEST_VSHUFFLE(INSN, , int, s, 16, 4); \
-@@ -100,6 +114,10 @@ void FNNAME (INSN_NAME) (void)
- TEST_VSHUFFLE(INSN, q, poly, p, 16, 8); \
- TEST_VSHUFFLE(INSN, q, float, f, 32, 4)
-
-+#define TEST_VSHUFFLE_FP16(INSN) \
-+ TEST_VSHUFFLE(INSN, , float, f, 16, 4); \
-+ TEST_VSHUFFLE(INSN, q, float, f, 16, 8);
-+
- #define TEST_ALL_EXTRA_CHUNKS() \
- TEST_EXTRA_CHUNK(int, 8, 8, 1); \
- TEST_EXTRA_CHUNK(int, 16, 4, 1); \
-@@ -130,8 +148,8 @@ void FNNAME (INSN_NAME) (void)
- CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \
- CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
- \
- CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment); \
-@@ -140,20 +158,40 @@ void FNNAME (INSN_NAME) (void)
- CHECK(test_name, uint, 8, 16, PRIx8, EXPECTED, comment); \
- CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
-- CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-- CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 8, 16, PRIx8, EXPECTED, comment); \
-+ CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
- CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment); \
-- } \
-+ }
-+
-+#define CHECK_RESULTS_VSHUFFLE_FP16(test_name,EXPECTED,comment) \
-+ { \
-+ CHECK_FP (test_name, float, 16, 4, PRIx16, EXPECTED, comment); \
-+ CHECK_FP (test_name, float, 16, 8, PRIx16, EXPECTED, comment); \
-+ }
-
- clean_results ();
-
- /* Execute the tests. */
- TEST_ALL_VSHUFFLE(INSN_NAME);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VSHUFFLE_FP16 (INSN_NAME);
-+#endif
-
- CHECK_RESULTS_VSHUFFLE (TEST_MSG, expected0, "(chunk 0)");
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS_VSHUFFLE_FP16 (TEST_MSG, expected0, "(chunk 0)");
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS();
-+#if defined (FP16_SUPPORTED)
-+ TEST_EXTRA_CHUNK (float, 16, 4, 1);
-+ TEST_EXTRA_CHUNK (float, 16, 8, 1);
-+#endif
-+
- CHECK_RESULTS_VSHUFFLE (TEST_MSG, expected1, "(chunk 1)");
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS_VSHUFFLE_FP16 (TEST_MSG, expected1, "(chunk 1)");
-+#endif
- }
-
- int main (void)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
-@@ -161,14 +161,16 @@ void vsli_extra(void)
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT);
-+ CHECK(TEST_MSG, int, 64, 2, PRIx64, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
-+ CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsqrt_f16_1.c
-@@ -0,0 +1,72 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+#define FP16_C(a) ((__fp16) a)
-+#define A FP16_C (123.4)
-+#define B FP16_C (567.8)
-+#define C FP16_C (34.8)
-+#define D FP16_C (1024)
-+#define E FP16_C (663.1)
-+#define F FP16_C (144.0)
-+#define G FP16_C (4.8)
-+#define H FP16_C (77)
-+
-+#define SQRT_A 0x498E /* FP16_C (__builtin_sqrtf (123.4)). */
-+#define SQRT_B 0x4DF5 /* FP16_C (__builtin_sqrtf (567.8)). */
-+#define SQRT_C 0x45E6 /* FP16_C (__builtin_sqrtf (34.8)). */
-+#define SQRT_D 0x5000 /* FP16_C (__builtin_sqrtf (1024)). */
-+#define SQRT_E 0x4E70 /* FP16_C (__builtin_sqrtf (663.1)). */
-+#define SQRT_F 0x4A00 /* FP16_C (__builtin_sqrtf (144.0)). */
-+#define SQRT_G 0x4062 /* FP16_C (__builtin_sqrtf (4.8)). */
-+#define SQRT_H 0x4863 /* FP16_C (__builtin_sqrtf (77)). */
-+
-+/* Expected results for vsqrt. */
-+VECT_VAR_DECL (expected_static, hfloat, 16, 4) []
-+ = { SQRT_A, SQRT_B, SQRT_C, SQRT_D };
-+
-+VECT_VAR_DECL (expected_static, hfloat, 16, 8) []
-+ = { SQRT_A, SQRT_B, SQRT_C, SQRT_D, SQRT_E, SQRT_F, SQRT_G, SQRT_H };
-+
-+void exec_vsqrt_f16 (void)
-+{
-+#undef TEST_MSG
-+#define TEST_MSG "VSQRT (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 4);
-+ VECT_VAR_DECL (buf_src, float, 16, 4) [] = {A, B, C, D};
-+ VLOAD (vsrc, buf_src, , float, f, 16, 4);
-+ DECL_VARIABLE (vector_res, float, 16, 4)
-+ = vsqrt_f16 (VECT_VAR (vsrc, float, 16, 4));
-+ vst1_f16 (VECT_VAR (result, float, 16, 4),
-+ VECT_VAR (vector_res, float, 16, 4));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 4, PRIx16, expected_static, "");
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VSQRTQ (FP16)"
-+ clean_results ();
-+
-+ DECL_VARIABLE(vsrc, float, 16, 8);
-+ VECT_VAR_DECL (buf_src, float, 16, 8) [] = {A, B, C, D, E, F, G, H};
-+ VLOAD (vsrc, buf_src, q, float, f, 16, 8);
-+ DECL_VARIABLE (vector_res, float, 16, 8)
-+ = vsqrtq_f16 (VECT_VAR (vsrc, float, 16, 8));
-+ vst1q_f16 (VECT_VAR (result, float, 16, 8),
-+ VECT_VAR (vector_res, float, 16, 8));
-+
-+ CHECK_FP (TEST_MSG, float, 16, 8, PRIx16, expected_static, "");
-+}
-+
-+int
-+main (void)
-+{
-+ exec_vsqrt_f16 ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsqrth_f16_1.c
-@@ -0,0 +1,40 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0x0000 /* 0.000000 */,
-+ 0x8000 /* -0.000000 */,
-+ 0x3da8 /* 1.414062 */,
-+ 0x3f0b /* 1.760742 */,
-+ 0x4479 /* 4.472656 */,
-+ 0x390f /* 0.632324 */,
-+ 0x7e00 /* nan */,
-+ 0x3c9d /* 1.153320 */,
-+ 0x7e00 /* nan */,
-+ 0x3874 /* 0.556641 */,
-+ 0x38a2 /* 0.579102 */,
-+ 0x39a8 /* 0.707031 */,
-+ 0x3c00 /* 1.000000 */,
-+ 0x433f /* 3.623047 */,
-+ 0x7e00 /* nan */,
-+ 0x4479 /* 4.472656 */,
-+ 0x7c00 /* inf */,
-+ 0x7e00 /* nan */
-+};
-+
-+#define TEST_MSG "VSQRTH_F16"
-+#define INSN_NAME vsqrth_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for unary scalar operations. */
-+#include "unary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsri_n.c
-@@ -163,14 +163,14 @@ void vsri_extra(void)
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, int, 8, 16, PRIx8, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
- }
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- void
- f_vst2_lane_f16 (float16_t * p, float16x4x2_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- void
- f_vst2q_lane_f16 (float16_t * p, float16x8x2_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- void
- f_vst3_lane_f16 (float16_t * p, float16x4x3_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- void
- f_vst3q_lane_f16 (float16_t * p, float16x8x3_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- void
- f_vst4_lane_f16 (float16_t * p, float16x4x4_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_f16_indices_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_f16_indices_1.c
-@@ -2,6 +2,7 @@
-
- /* { dg-do compile } */
- /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
-+/* { dg-require-effective-target arm_neon_fp16_ok { target { arm*-*-* } } } */
-
- void
- f_vst4q_lane_f16 (float16_t * p, float16x8x4_t v)
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
-@@ -14,6 +14,7 @@ VECT_VAR_DECL(expected_st2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
- VECT_VAR_DECL(expected_st2_0,poly,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected_st2_0,int,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -24,6 +25,8 @@ VECT_VAR_DECL(expected_st2_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
- 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st2_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
- 0x0, 0x0 };
-
-@@ -39,6 +42,7 @@ VECT_VAR_DECL(expected_st2_1,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st2_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_1,hfloat,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -48,6 +52,8 @@ VECT_VAR_DECL(expected_st2_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- VECT_VAR_DECL(expected_st2_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st2_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st2_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results for vst3, chunk 0. */
-@@ -62,6 +68,7 @@ VECT_VAR_DECL(expected_st3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
- VECT_VAR_DECL(expected_st3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0 };
-+VECT_VAR_DECL(expected_st3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0 };
- VECT_VAR_DECL(expected_st3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected_st3_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -73,6 +80,8 @@ VECT_VAR_DECL(expected_st3_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
- 0xfffffff2, 0x0 };
- VECT_VAR_DECL(expected_st3_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st3_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
- 0xc1600000, 0x0 };
-
-@@ -88,6 +97,7 @@ VECT_VAR_DECL(expected_st3_1,uint,32,2) [] = { 0xfffffff2, 0x0 };
- VECT_VAR_DECL(expected_st3_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st3_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_1,hfloat,32,2) [] = { 0xc1600000, 0x0 };
- VECT_VAR_DECL(expected_st3_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -97,6 +107,8 @@ VECT_VAR_DECL(expected_st3_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- VECT_VAR_DECL(expected_st3_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st3_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results for vst3, chunk 2. */
-@@ -111,6 +123,7 @@ VECT_VAR_DECL(expected_st3_2,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st3_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_2,hfloat,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -120,6 +133,8 @@ VECT_VAR_DECL(expected_st3_2,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- VECT_VAR_DECL(expected_st3_2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_2,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st3_2,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st3_2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results for vst4, chunk 0. */
-@@ -134,6 +149,7 @@ VECT_VAR_DECL(expected_st4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
- VECT_VAR_DECL(expected_st4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
-+VECT_VAR_DECL(expected_st4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
- VECT_VAR_DECL(expected_st4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected_st4_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -145,6 +161,8 @@ VECT_VAR_DECL(expected_st4_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
- 0xfffffff2, 0xfffffff3 };
- VECT_VAR_DECL(expected_st4_0,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_0,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
- 0xc1600000, 0xc1500000 };
-
-@@ -160,6 +178,7 @@ VECT_VAR_DECL(expected_st4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
- VECT_VAR_DECL(expected_st4_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
- VECT_VAR_DECL(expected_st4_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -169,6 +188,8 @@ VECT_VAR_DECL(expected_st4_1,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- VECT_VAR_DECL(expected_st4_1,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_1,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_1,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_1,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results for vst4, chunk 2. */
-@@ -183,6 +204,7 @@ VECT_VAR_DECL(expected_st4_2,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_2,hfloat,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -192,6 +214,8 @@ VECT_VAR_DECL(expected_st4_2,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- VECT_VAR_DECL(expected_st4_2,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_2,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_2,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_2,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Expected results for vst4, chunk 3. */
-@@ -206,6 +230,7 @@ VECT_VAR_DECL(expected_st4_3,uint,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_3,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_3,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_3,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_3,hfloat,32,2) [] = { 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_3,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-@@ -215,6 +240,8 @@ VECT_VAR_DECL(expected_st4_3,uint,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- VECT_VAR_DECL(expected_st4_3,uint,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_3,poly,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
- 0x0, 0x0, 0x0, 0x0 };
-+VECT_VAR_DECL(expected_st4_3,hfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
-+ 0x0, 0x0, 0x0, 0x0 };
- VECT_VAR_DECL(expected_st4_3,hfloat,32,4) [] = { 0x0, 0x0, 0x0, 0x0 };
-
- /* Declare additional input buffers as needed. */
-@@ -229,6 +256,9 @@ VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
- VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
- VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
- VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
-+#endif
- VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
-
- /* Input buffers for vld3_lane. */
-@@ -242,6 +272,9 @@ VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
- VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
- VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
- VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
-+#endif
- VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
-
- /* Input buffers for vld4_lane. */
-@@ -255,6 +288,9 @@ VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
- VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
- VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
- VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
-+#endif
- VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
-
- void exec_vstX_lane (void)
-@@ -302,7 +338,7 @@ void exec_vstX_lane (void)
-
- /* We need all variants in 64 bits, but there is no 64x2 variant,
- nor 128 bits vectors of int8/uint8/poly8. */
--#define DECL_ALL_VSTX_LANE(X) \
-+#define DECL_ALL_VSTX_LANE_NO_FP16(X) \
- DECL_VSTX_LANE(int, 8, 8, X); \
- DECL_VSTX_LANE(int, 16, 4, X); \
- DECL_VSTX_LANE(int, 32, 2, X); \
-@@ -319,11 +355,20 @@ void exec_vstX_lane (void)
- DECL_VSTX_LANE(poly, 16, 8, X); \
- DECL_VSTX_LANE(float, 32, 4, X)
-
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+#define DECL_ALL_VSTX_LANE(X) \
-+ DECL_ALL_VSTX_LANE_NO_FP16(X); \
-+ DECL_VSTX_LANE(float, 16, 4, X); \
-+ DECL_VSTX_LANE(float, 16, 8, X)
-+#else
-+#define DECL_ALL_VSTX_LANE(X) DECL_ALL_VSTX_LANE_NO_FP16(X)
-+#endif
-+
- #define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L]
-
- /* Use the same lanes regardless of the size of the array (X), for
- simplicity. */
--#define TEST_ALL_VSTX_LANE(X) \
-+#define TEST_ALL_VSTX_LANE_NO_FP16(X) \
- TEST_VSTX_LANE(, int, s, 8, 8, X, 7); \
- TEST_VSTX_LANE(, int, s, 16, 4, X, 2); \
- TEST_VSTX_LANE(, int, s, 32, 2, X, 0); \
-@@ -340,7 +385,16 @@ void exec_vstX_lane (void)
- TEST_VSTX_LANE(q, poly, p, 16, 8, X, 5); \
- TEST_VSTX_LANE(q, float, f, 32, 4, X, 2)
-
--#define TEST_ALL_EXTRA_CHUNKS(X, Y) \
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+#define TEST_ALL_VSTX_LANE(X) \
-+ TEST_ALL_VSTX_LANE_NO_FP16(X); \
-+ TEST_VSTX_LANE(, float, f, 16, 4, X, 2); \
-+ TEST_VSTX_LANE(q, float, f, 16, 8, X, 6)
-+#else
-+#define TEST_ALL_VSTX_LANE(X) TEST_ALL_VSTX_LANE_NO_FP16(X)
-+#endif
-+
-+#define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y) \
- TEST_EXTRA_CHUNK(int, 8, 8, X, Y); \
- TEST_EXTRA_CHUNK(int, 16, 4, X, Y); \
- TEST_EXTRA_CHUNK(int, 32, 2, X, Y); \
-@@ -357,6 +411,15 @@ void exec_vstX_lane (void)
- TEST_EXTRA_CHUNK(poly, 16, 8, X, Y); \
- TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
-
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+#define TEST_ALL_EXTRA_CHUNKS(X,Y) \
-+ TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y); \
-+ TEST_EXTRA_CHUNK(float, 16, 4, X, Y); \
-+ TEST_EXTRA_CHUNK(float, 16, 8, X, Y)
-+#else
-+#define TEST_ALL_EXTRA_CHUNKS(X,Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)
-+#endif
-+
- /* Declare the temporary buffers / variables. */
- DECL_ALL_VSTX_LANE(2);
- DECL_ALL_VSTX_LANE(3);
-@@ -371,12 +434,18 @@ void exec_vstX_lane (void)
- DUMMY_ARRAY(buffer_src, uint, 32, 2, 4);
- DUMMY_ARRAY(buffer_src, poly, 8, 8, 4);
- DUMMY_ARRAY(buffer_src, poly, 16, 4, 4);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
-+#endif
- DUMMY_ARRAY(buffer_src, float, 32, 2, 4);
- DUMMY_ARRAY(buffer_src, int, 16, 8, 4);
- DUMMY_ARRAY(buffer_src, int, 32, 4, 4);
- DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
- DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
- DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
-+#endif
- DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
-
- /* Check vst2_lane/vst2q_lane. */
-@@ -391,15 +460,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st2_0, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st2_0, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_0, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_0, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_0, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_0, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st2_0, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_0, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_0, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_0, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_0, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st2_0, CMT);
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS(2, 1);
- #undef CMT
-@@ -410,15 +483,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st2_1, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st2_1, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_1, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_1, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_1, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_1, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st2_1, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st2_1, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st2_1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st2_1, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st2_1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st2_1, CMT);
-+#endif
-
-
- /* Check vst3_lane/vst3q_lane. */
-@@ -435,15 +512,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_0, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_0, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_0, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_0, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_0, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_0, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_0, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_0, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_0, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_0, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_0, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_0, CMT);
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS(3, 1);
-
-@@ -455,15 +536,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_1, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_1, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_1, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_1, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_1, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_1, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_1, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_1, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_1, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_1, CMT);
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS(3, 2);
-
-@@ -475,15 +560,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st3_2, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st3_2, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_2, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_2, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_2, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_2, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st3_2, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st3_2, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st3_2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st3_2, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st3_2, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st3_2, CMT);
-+#endif
-
-
- /* Check vst4_lane/vst4q_lane. */
-@@ -500,15 +589,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_0, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_0, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_0, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_0, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_0, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_0, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_0, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_0, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_0, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_0, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_0, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_0, CMT);
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS(4, 1);
-
-@@ -520,15 +613,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_1, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_1, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_1, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_1, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_1, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_1, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_1, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_1, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_1, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_1, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_1, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_1, CMT);
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS(4, 2);
-
-@@ -540,15 +637,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_2, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_2, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_2, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_2, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_2, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_2, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_2, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_2, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_2, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_2, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_2, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_2, CMT);
-+#endif
-
- TEST_ALL_EXTRA_CHUNKS(4, 3);
-
-@@ -560,15 +661,19 @@ void exec_vstX_lane (void)
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_st4_3, CMT);
- CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_st4_3, CMT);
- CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_3, CMT);
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
-- CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_3, CMT);
- CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_3, CMT);
- CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_3, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_st4_3, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_st4_3, CMT);
-- CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
-+ CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected_st4_3, CMT);
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_st4_3, CMT);
-+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_st4_3, CMT);
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_st4_3, CMT);
-+#endif
- }
-
- int main (void)
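
The new hfloat16 tables in vstX_lane.c mirror the existing hfloat32 ones and follow directly from the vstN_lane semantics: lane L of each of the N registers is stored contiguously, and the rest of the buffer keeps its cleared value. A sketch of the FP16 case, assuming a target where the FP16 format is enabled:

#include <arm_neon.h>

/* For vst2_lane_f16 with lane 2 (the lane the test uses for 16x4),
   p[0] = v.val[0][2] and p[1] = v.val[1][2]; with the standard test
   buffers these are -16.0 (0xcc00) and -15.0 (0xcb80), the two
   leading values of expected_st2_0.  */
void
store_lane_f16 (float16_t *p, float16x4x2_t v)
{
  vst2_lane_f16 (p, v, 2);
}
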
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsub.c
-@@ -44,6 +44,14 @@ VECT_VAR_DECL(expected,uint,64,2) [] = { 0xffffffffffffffed,
- VECT_VAR_DECL(expected_float32,hfloat,32,2) [] = { 0xc00ccccd, 0xc00ccccd };
- VECT_VAR_DECL(expected_float32,hfloat,32,4) [] = { 0xc00ccccc, 0xc00ccccc,
- 0xc00ccccc, 0xc00ccccc };
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 4) [] = { 0xc066, 0xc066,
-+ 0xc066, 0xc066 };
-+VECT_VAR_DECL(expected_float16, hfloat, 16, 8) [] = { 0xc067, 0xc067,
-+ 0xc067, 0xc067,
-+ 0xc067, 0xc067,
-+ 0xc067, 0xc067 };
-+#endif
-
- void exec_vsub_f32(void)
- {
-@@ -67,4 +75,27 @@ void exec_vsub_f32(void)
-
- CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_float32, "");
- CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected_float32, "");
-+
-+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ DECL_VARIABLE(vector, float, 16, 4);
-+ DECL_VARIABLE(vector, float, 16, 8);
-+
-+ DECL_VARIABLE(vector2, float, 16, 4);
-+ DECL_VARIABLE(vector2, float, 16, 8);
-+
-+ DECL_VARIABLE(vector_res, float, 16, 4);
-+ DECL_VARIABLE(vector_res, float, 16, 8);
-+
-+ VDUP(vector, , float, f, 16, 4, 2.3f);
-+ VDUP(vector, q, float, f, 16, 8, 3.4f);
-+
-+ VDUP(vector2, , float, f, 16, 4, 4.5f);
-+ VDUP(vector2, q, float, f, 16, 8, 5.6f);
-+
-+ TEST_BINARY_OP(INSN_NAME, , float, f, 16, 4);
-+ TEST_BINARY_OP(INSN_NAME, q, float, f, 16, 8);
-+
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected_float16, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected_float16, "");
-+#endif
- }
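
Both FP16 tables above encode a nominal result of -2.2 (2.3 - 4.5 and 3.4 - 5.6), yet they differ in the last bit. That is expected: the inputs are rounded to binary16 before the subtraction, so (__fp16)2.3 - (__fp16)4.5 = 2.30078125 - 4.5 = -2.19921875 (0xc066), while (__fp16)3.4 - (__fp16)5.6 = 3.400390625 - 5.6015625 = -2.201171875 (0xc067), both of which happen to be exactly representable.
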
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsubh_f16_1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_hw } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+#include <arm_fp16.h>
-+
-+#define INFF __builtin_inf ()
-+
-+/* Expected results (16-bit hexadecimal representation). */
-+uint16_t expected[] =
-+{
-+ 0xbc00 /* -1.000000 */,
-+ 0xbc00 /* -1.000000 */,
-+ 0x4654 /* 6.328125 */,
-+ 0xd60e /* -96.875000 */,
-+ 0xc900 /* -10.000000 */,
-+ 0x36b8 /* 0.419922 */,
-+ 0xc19a /* -2.800781 */,
-+ 0x4848 /* 8.562500 */,
-+ 0xbd34 /* -1.300781 */,
-+ 0xccec /* -19.687500 */,
-+ 0x4791 /* 7.566406 */,
-+ 0xbf34 /* -1.800781 */,
-+ 0x484d /* 8.601562 */,
-+ 0x4804 /* 8.031250 */,
-+ 0xc69c /* -6.609375 */,
-+ 0x4ceb /* 19.671875 */,
-+ 0x7c00 /* inf */,
-+ 0xfc00 /* -inf */
-+};
-+
-+#define TEST_MSG "VSUB_F16"
-+#define INSN_NAME vsubh_f16
-+
-+#define EXPECTED expected
-+
-+#define INPUT_TYPE float16_t
-+#define OUTPUT_TYPE float16_t
-+#define OUTPUT_TYPE_SIZE 16
-+
-+/* Include the template for binary scalar operations. */
-+#include "binary_scalar_op.inc"
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtbX.c
-@@ -167,7 +167,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl1, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl1, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl1, "");
-
- /* Check vtbl2. */
- clean_results ();
-@@ -177,7 +177,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl2, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl2, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl2, "");
-
- /* Check vtbl3. */
- clean_results ();
-@@ -187,7 +187,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl3, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl3, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl3, "");
-
- /* Check vtbl4. */
- clean_results ();
-@@ -197,7 +197,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbl4, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbl4, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbl4, "");
-
-
- /* Now test VTBX. */
-@@ -249,7 +249,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx1, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx1, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx1, "");
-
- /* Check vtbx2. */
- clean_results ();
-@@ -259,7 +259,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx2, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx2, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx2, "");
-
- /* Check vtbx3. */
- clean_results ();
-@@ -269,7 +269,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx3, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx3, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx3, "");
-
- /* Check vtbx4. */
- clean_results ();
-@@ -279,7 +279,7 @@ void exec_vtbX (void)
-
- CHECK(TEST_MSG, int, 8, 8, PRIx8, expected_vtbx4, "");
- CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_vtbx4, "");
-- CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, "");
-+ CHECK_POLY(TEST_MSG, poly, 8, 8, PRIx8, expected_vtbx4, "");
- }
-
- int main (void)
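
For reading the vtbX expected tables: the table-lookup intrinsics treat each selector byte as an index into the table operand, with out-of-range indices giving 0 for VTBL and leaving the destination lane untouched for VTBX. Illustration only:

#include <arm_neon.h>

/* Lane i of the result is table[idx[i]] when 0 <= idx[i] <= 7,
   and 0 otherwise (vtbx1 would keep the destination lane instead).  */
int8x8_t
tbl_demo (int8x8_t table, int8x8_t idx)
{
  return vtbl1_s8 (table, idx);
}
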
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn.c
-@@ -15,6 +15,10 @@ VECT_VAR_DECL(expected0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
- VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0x55, 0x55,
- 0xf2, 0xf3, 0x55, 0x55 };
- VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf1, 0x11, 0x11,
- 0xf2, 0xf3, 0x11, 0x11,
-@@ -36,6 +40,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf1, 0x55, 0x55,
- 0xf6, 0xf7, 0x55, 0x55 };
- VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1, 0x66, 0x66,
- 0xfff2, 0xfff3, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0x4b4d, 0x4b4d,
-+ 0xcb00, 0xca80,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
- 0x42073333, 0x42073333 };
-
-@@ -51,6 +61,10 @@ VECT_VAR_DECL(expected1,uint,32,2) [] = { 0x77, 0x77 };
- VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf4, 0xf5, 0x55, 0x55,
- 0xf6, 0xf7, 0x55, 0x55 };
- VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff2, 0xfff3, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb00, 0xca80,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
- VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf8, 0xf9, 0x11, 0x11,
- 0xfa, 0xfb, 0x11, 0x11,
-@@ -72,6 +86,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf8, 0xf9, 0x55, 0x55,
- 0xfe, 0xff, 0x55, 0x55 };
- VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff4, 0xfff5, 0x66, 0x66,
- 0xfff6, 0xfff7, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xca00, 0xc980,
-+ 0x4b4d, 0x4b4d,
-+ 0xc900, 0xc880,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1600000, 0xc1500000,
- 0x42073333, 0x42073333 };
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtrn_half.c
-@@ -0,0 +1,263 @@
-+/* { dg-do run } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0x11, 0xf2, 0x11,
-+ 0xf4, 0x11, 0xf6, 0x11 };
-+VECT_VAR_DECL(expected,int,16,4) [] = { 0xfff0, 0x22, 0xfff2, 0x22 };
-+VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff0, 0x33 };
-+VECT_VAR_DECL(expected,int,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0x55, 0xf2, 0x55,
-+ 0xf4, 0x55, 0xf6, 0x55 };
-+VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0x66, 0xfff2, 0x66 };
-+VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0x77 };
-+VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0x55, 0xf2, 0x55,
-+ 0xf4, 0x55, 0xf6, 0x55 };
-+VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0x66, 0xfff2, 0x66 };
-+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0x4b4d,
-+ 0xcb00, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0x11, 0xf2, 0x11,
-+ 0xf4, 0x11, 0xf6, 0x11,
-+ 0xf8, 0x11, 0xfa, 0x11,
-+ 0xfc, 0x11, 0xfe, 0x11 };
-+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0x22, 0xfff2, 0x22,
-+ 0xfff4, 0x22, 0xfff6, 0x22 };
-+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0x33,
-+ 0xfffffff2, 0x33 };
-+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
-+ 0x44 };
-+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0x55, 0xf2, 0x55,
-+ 0xf4, 0x55, 0xf6, 0x55,
-+ 0xf8, 0x55, 0xfa, 0x55,
-+ 0xfc, 0x55, 0xfe, 0x55 };
-+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0x66, 0xfff2, 0x66,
-+ 0xfff4, 0x66, 0xfff6, 0x66 };
-+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0x77,
-+ 0xfffffff2, 0x77 };
-+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0x88 };
-+VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0x55, 0xf2, 0x55,
-+ 0xf4, 0x55, 0xf6, 0x55,
-+ 0xf8, 0x55, 0xfa, 0x55,
-+ 0xfc, 0x55, 0xfe, 0x55 };
-+VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0x66, 0xfff2, 0x66,
-+ 0xfff4, 0x66, 0xfff6, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0x4b4d,
-+ 0xcb00, 0x4b4d,
-+ 0xca00, 0x4b4d,
-+ 0xc900, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0x42073333,
-+ 0xc1600000, 0x42073333 };
-+
-+#define TEST_MSG "VTRN1"
-+void exec_vtrn_half (void)
-+{
-+#define TEST_VTRN(PART, Q, T1, T2, W, N) \
-+ VECT_VAR(vector_res, T1, W, N) = \
-+ vtrn##PART##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
-+ VECT_VAR(vector2, T1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
-+
-+#define TEST_VTRN1(Q, T1, T2, W, N) TEST_VTRN(1, Q, T1, T2, W, N)
-+
-+ /* Input vector can only have 64 bits. */
-+ DECL_VARIABLE_ALL_VARIANTS(vector);
-+ DECL_VARIABLE_ALL_VARIANTS(vector2);
-+ DECL_VARIABLE(vector, float, 64, 2);
-+ DECL_VARIABLE(vector2, float, 64, 2);
-+
-+ DECL_VARIABLE_ALL_VARIANTS(vector_res);
-+ DECL_VARIABLE(vector_res, float, 64, 2);
-+
-+ clean_results ();
-+ /* We don't have vtrn1_T64x1, so set expected to the clean value. */
-+ CLEAN(expected, int, 64, 1);
-+ CLEAN(expected, uint, 64, 1);
-+
-+ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD(vector, buffer, , float, f, 32, 2);
-+ VLOAD(vector, buffer, q, float, f, 32, 4);
-+ VLOAD(vector, buffer, q, float, f, 64, 2);
-+
-+ /* Choose arbitrary initialization values. */
-+ VDUP(vector2, , int, s, 8, 8, 0x11);
-+ VDUP(vector2, , int, s, 16, 4, 0x22);
-+ VDUP(vector2, , int, s, 32, 2, 0x33);
-+ VDUP(vector2, , uint, u, 8, 8, 0x55);
-+ VDUP(vector2, , uint, u, 16, 4, 0x66);
-+ VDUP(vector2, , uint, u, 32, 2, 0x77);
-+ VDUP(vector2, , poly, p, 8, 8, 0x55);
-+ VDUP(vector2, , poly, p, 16, 4, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */
-+#endif
-+ VDUP(vector2, , float, f, 32, 2, 33.6f);
-+
-+ VDUP(vector2, q, int, s, 8, 16, 0x11);
-+ VDUP(vector2, q, int, s, 16, 8, 0x22);
-+ VDUP(vector2, q, int, s, 32, 4, 0x33);
-+ VDUP(vector2, q, int, s, 64, 2, 0x44);
-+ VDUP(vector2, q, uint, u, 8, 16, 0x55);
-+ VDUP(vector2, q, uint, u, 16, 8, 0x66);
-+ VDUP(vector2, q, uint, u, 32, 4, 0x77);
-+ VDUP(vector2, q, uint, u, 64, 2, 0x88);
-+ VDUP(vector2, q, poly, p, 8, 16, 0x55);
-+ VDUP(vector2, q, poly, p, 16, 8, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, q, float, f, 16, 8, 14.6f);
-+#endif
-+ VDUP(vector2, q, float, f, 32, 4, 33.8f);
-+ VDUP(vector2, q, float, f, 64, 2, 33.8f);
-+
-+ TEST_VTRN1(, int, s, 8, 8);
-+ TEST_VTRN1(, int, s, 16, 4);
-+ TEST_VTRN1(, int, s, 32, 2);
-+ TEST_VTRN1(, uint, u, 8, 8);
-+ TEST_VTRN1(, uint, u, 16, 4);
-+ TEST_VTRN1(, uint, u, 32, 2);
-+ TEST_VTRN1(, poly, p, 8, 8);
-+ TEST_VTRN1(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VTRN1(, float, f, 16, 4);
-+#endif
-+ TEST_VTRN1(, float, f, 32, 2);
-+
-+ TEST_VTRN1(q, int, s, 8, 16);
-+ TEST_VTRN1(q, int, s, 16, 8);
-+ TEST_VTRN1(q, int, s, 32, 4);
-+ TEST_VTRN1(q, int, s, 64, 2);
-+ TEST_VTRN1(q, uint, u, 8, 16);
-+ TEST_VTRN1(q, uint, u, 16, 8);
-+ TEST_VTRN1(q, uint, u, 32, 4);
-+ TEST_VTRN1(q, uint, u, 64, 2);
-+ TEST_VTRN1(q, poly, p, 8, 16);
-+ TEST_VTRN1(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VTRN1(q, float, f, 16, 8);
-+#endif
-+ TEST_VTRN1(q, float, f, 32, 4);
-+ TEST_VTRN1(q, float, f, 64, 2);
-+
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS (TEST_MSG, "");
-+#else
-+ CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
-+#endif
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VTRN2"
-+
-+#define TEST_VTRN2(Q, T1, T2, W, N) TEST_VTRN(2, Q, T1, T2, W, N)
-+
-+/* Expected results. */
-+VECT_VAR_DECL(expected2,int,8,8) [] = { 0xf1, 0x11, 0xf3, 0x11,
-+ 0xf5, 0x11, 0xf7, 0x11 };
-+VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff1, 0x22, 0xfff3, 0x22 };
-+VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0x33 };
-+VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xf1, 0x55, 0xf3, 0x55,
-+ 0xf5, 0x55, 0xf7, 0x55 };
-+VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff1, 0x66, 0xfff3, 0x66 };
-+VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0x77 };
-+VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf1, 0x55, 0xf3, 0x55,
-+ 0xf5, 0x55, 0xf7, 0x55 };
-+VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff1, 0x66, 0xfff3, 0x66 };
-+VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb80, 0x4b4d,
-+ 0xca80, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf1, 0x11, 0xf3, 0x11,
-+ 0xf5, 0x11, 0xf7, 0x11,
-+ 0xf9, 0x11, 0xfb, 0x11,
-+ 0xfd, 0x11, 0xff, 0x11 };
-+VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff1, 0x22, 0xfff3, 0x22,
-+ 0xfff5, 0x22, 0xfff7, 0x22 };
-+VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff1, 0x33,
-+ 0xfffffff3, 0x33 };
-+VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff1,
-+ 0x44 };
-+VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf1, 0x55, 0xf3, 0x55,
-+ 0xf5, 0x55, 0xf7, 0x55,
-+ 0xf9, 0x55, 0xfb, 0x55,
-+ 0xfd, 0x55, 0xff, 0x55 };
-+VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff1, 0x66, 0xfff3, 0x66,
-+ 0xfff5, 0x66, 0xfff7, 0x66 };
-+VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff1, 0x77,
-+ 0xfffffff3, 0x77 };
-+VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff1,
-+ 0x88 };
-+VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf1, 0x55, 0xf3, 0x55,
-+ 0xf5, 0x55, 0xf7, 0x55,
-+ 0xf9, 0x55, 0xfb, 0x55,
-+ 0xfd, 0x55, 0xff, 0x55 };
-+VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0x66, 0xfff3, 0x66,
-+ 0xfff5, 0x66, 0xfff7, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb80, 0x4b4d,
-+ 0xca80, 0x4b4d,
-+ 0xc980, 0x4b4d,
-+ 0xc880, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0x42073333,
-+ 0xc1500000, 0x42073333 };
-+ clean_results ();
-+ CLEAN(expected2, int, 64, 1);
-+ CLEAN(expected2, uint, 64, 1);
-+
-+ TEST_VTRN2(, int, s, 8, 8);
-+ TEST_VTRN2(, int, s, 16, 4);
-+ TEST_VTRN2(, int, s, 32, 2);
-+ TEST_VTRN2(, uint, u, 8, 8);
-+ TEST_VTRN2(, uint, u, 16, 4);
-+ TEST_VTRN2(, uint, u, 32, 2);
-+ TEST_VTRN2(, poly, p, 8, 8);
-+ TEST_VTRN2(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VTRN2(, float, f, 16, 4);
-+#endif
-+ TEST_VTRN2(, float, f, 32, 2);
-+
-+ TEST_VTRN2(q, int, s, 8, 16);
-+ TEST_VTRN2(q, int, s, 16, 8);
-+ TEST_VTRN2(q, int, s, 32, 4);
-+ TEST_VTRN2(q, int, s, 64, 2);
-+ TEST_VTRN2(q, uint, u, 8, 16);
-+ TEST_VTRN2(q, uint, u, 16, 8);
-+ TEST_VTRN2(q, uint, u, 32, 4);
-+ TEST_VTRN2(q, uint, u, 64, 2);
-+ TEST_VTRN2(q, poly, p, 8, 16);
-+ TEST_VTRN2(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VTRN2(q, float, f, 16, 8);
-+#endif
-+ TEST_VTRN2(q, float, f, 32, 4);
-+ TEST_VTRN2(q, float, f, 64, 2);
-+
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
-+#if defined (FP16_SUPPORTED)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, "");
-+#endif
-+}
-+
-+int main (void)
-+{
-+ exec_vtrn_half ();
-+ return 0;
-+}
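
vtrn1/vtrn2, tested above, are AArch64-only (hence the dg-skip-if for arm*-*-*): vtrn1 interleaves the even-indexed lanes of its two operands and vtrn2 the odd-indexed ones, which is exactly what the expected/expected2 tables encode. A small illustration:

#include <arm_neon.h>

void
trn_demo (void)
{
  int8x8_t a = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int8x8_t b = { 10, 11, 12, 13, 14, 15, 16, 17 };
  int8x8_t t1 = vtrn1_s8 (a, b);  /* { 0,10, 2,12, 4,14, 6,16 } */
  int8x8_t t2 = vtrn2_s8 (a, b);  /* { 1,11, 3,13, 5,15, 7,17 } */
  (void) t1;
  (void) t2;
}
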
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
-@@ -32,10 +32,21 @@ VECT_VAR_DECL(expected_unsigned,uint,16,8) [] = { 0x0, 0xffff,
- VECT_VAR_DECL(expected_unsigned,uint,32,4) [] = { 0x0, 0xffffffff,
- 0x0, 0xffffffff };
-
--#ifndef INSN_NAME
-+/* Expected results with poly input. */
-+VECT_VAR_DECL(expected_poly,uint,8,8) [] = { 0x0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected_poly,uint,8,16) [] = { 0x0, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff,
-+ 0xff, 0xff, 0xff, 0xff };
-+VECT_VAR_DECL(expected_poly,uint,16,4) [] = { 0x0, 0xffff, 0x0, 0xffff };
-+VECT_VAR_DECL(expected_poly,uint,16,8) [] = { 0x0, 0xffff,
-+ 0x0, 0xffff,
-+ 0xffff, 0xffff,
-+ 0xffff, 0xffff };
-+
- #define INSN_NAME vtst
- #define TEST_MSG "VTST/VTSTQ"
--#endif
-
- /* We can't use the standard ref_v_binary_op.c template because vtst
- has no 64 bits variant, and outputs are always of uint type. */
-@@ -73,12 +84,16 @@ FNNAME (INSN_NAME)
- VDUP(vector2, , uint, u, 8, 8, 15);
- VDUP(vector2, , uint, u, 16, 4, 5);
- VDUP(vector2, , uint, u, 32, 2, 1);
-+ VDUP(vector2, , poly, p, 8, 8, 15);
-+ VDUP(vector2, , poly, p, 16, 4, 5);
- VDUP(vector2, q, int, s, 8, 16, 15);
- VDUP(vector2, q, int, s, 16, 8, 5);
- VDUP(vector2, q, int, s, 32, 4, 1);
- VDUP(vector2, q, uint, u, 8, 16, 15);
- VDUP(vector2, q, uint, u, 16, 8, 5);
- VDUP(vector2, q, uint, u, 32, 4, 1);
-+ VDUP(vector2, q, poly, p, 8, 16, 15);
-+ VDUP(vector2, q, poly, p, 16, 8, 5);
-
- #define TEST_MACRO_NO64BIT_VARIANT_1_5(MACRO, VAR, T1, T2) \
- MACRO(VAR, , T1, T2, 8, 8); \
-@@ -111,6 +126,18 @@ FNNAME (INSN_NAME)
- CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_unsigned, CMT);
- CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_unsigned, CMT);
- CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_unsigned, CMT);
-+
-+ /* Now, test the variants with poly8 and poly16 as input. */
-+#undef CMT
-+#define CMT " (poly input)"
-+ TEST_BINARY_OP(INSN_NAME, , poly, p, 8, 8);
-+ TEST_BINARY_OP(INSN_NAME, , poly, p, 16, 4);
-+ TEST_BINARY_OP(INSN_NAME, q, poly, p, 8, 16);
-+ TEST_BINARY_OP(INSN_NAME, q, poly, p, 16, 8);
-+ CHECK(TEST_MSG, uint, 8, 8, PRIx8, expected_poly, CMT);
-+ CHECK(TEST_MSG, uint, 16, 4, PRIx16, expected_poly, CMT);
-+ CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_poly, CMT);
-+ CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_poly, CMT);
- }
-
- int main (void)
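
The new poly variants reuse the same VTST semantics: a result lane is all-ones exactly when the bitwise AND of the input lanes is non-zero. With the second operand set to 15, only the 0xf0 lane of the standard input (0xf0 & 0xf == 0) yields 0, which is the pattern in expected_poly. Illustration only:

#include <arm_neon.h>

/* lane i: (a[i] & b[i]) != 0 ? 0xff : 0x00 */
uint8x8_t
tst_demo (poly8x8_t a, poly8x8_t b)
{
  return vtst_p8 (a, b);
}
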
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp.c
-@@ -19,6 +19,10 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf4, 0xf5, 0xf6, 0xf7 };
- VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff1,
- 0xfff2, 0xfff3 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80 };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
- 0xf4, 0xf5, 0xf6, 0xf7,
-@@ -48,6 +52,12 @@ VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff1,
- 0xfff2, 0xfff3,
- 0xfff4, 0xfff5,
- 0xfff6, 0xfff7 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xcb80,
-+ 0xcb00, 0xca80,
-+ 0xca00, 0xc980,
-+ 0xc900, 0xc880 };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
- 0xc1600000, 0xc1500000 };
-
-@@ -63,6 +73,10 @@ VECT_VAR_DECL(expected1,uint,32,2) [] = { 0x77, 0x77 };
- VECT_VAR_DECL(expected1,poly,8,8) [] = { 0x55, 0x55, 0x55, 0x55,
- 0x55, 0x55, 0x55, 0x55 };
- VECT_VAR_DECL(expected1,poly,16,4) [] = { 0x66, 0x66, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0x4b4d, 0x4b4d,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
- VECT_VAR_DECL(expected1,int,8,16) [] = { 0x11, 0x11, 0x11, 0x11,
- 0x11, 0x11, 0x11, 0x11,
-@@ -84,6 +98,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0x55, 0x55, 0x55, 0x55,
- 0x55, 0x55, 0x55, 0x55 };
- VECT_VAR_DECL(expected1,poly,16,8) [] = { 0x66, 0x66, 0x66, 0x66,
- 0x66, 0x66, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0x4b4d, 0x4b4d,
-+ 0x4b4d, 0x4b4d,
-+ 0x4b4d, 0x4b4d,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0x42073333, 0x42073333,
- 0x42073333, 0x42073333 };
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vuzp_half.c
-@@ -0,0 +1,259 @@
-+/* { dg-do run } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
-+ 0x11, 0x11, 0x11, 0x11 };
-+VECT_VAR_DECL(expected,int,16,4) [] = { 0xfff0, 0xfff2, 0x22, 0x22 };
-+VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff0, 0x33 };
-+VECT_VAR_DECL(expected,int,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0xfff2, 0x66, 0x66 };
-+VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0x77 };
-+VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff2, 0x66, 0x66 };
-+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0xcb00,
-+ 0x4b4d, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
-+ 0xf8, 0xfa, 0xfc, 0xfe,
-+ 0x11, 0x11, 0x11, 0x11,
-+ 0x11, 0x11, 0x11, 0x11 };
-+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6,
-+ 0x22, 0x22, 0x22, 0x22 };
-+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0xfffffff2,
-+ 0x33, 0x33 };
-+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
-+ 0x44 };
-+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
-+ 0xf8, 0xfa, 0xfc, 0xfe,
-+ 0x55, 0x55, 0x55, 0x55,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6,
-+ 0x66, 0x66, 0x66, 0x66 };
-+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff2, 0x77, 0x77 };
-+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0x88 };
-+VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf2, 0xf4, 0xf6,
-+ 0xf8, 0xfa, 0xfc, 0xfe,
-+ 0x55, 0x55, 0x55, 0x55,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff2, 0xfff4, 0xfff6,
-+ 0x66, 0x66, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0xcb00, 0xca00, 0xc900,
-+ 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1600000,
-+ 0x42073333, 0x42073333 };
-+
-+#define TEST_MSG "VUZP1"
-+void exec_vuzp_half (void)
-+{
-+#define TEST_VUZP(PART, Q, T1, T2, W, N) \
-+ VECT_VAR(vector_res, T1, W, N) = \
-+ vuzp##PART##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
-+ VECT_VAR(vector2, T1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
-+
-+#define TEST_VUZP1(Q, T1, T2, W, N) TEST_VUZP(1, Q, T1, T2, W, N)
-+
-+ /* Input vector can only have 64 bits. */
-+ DECL_VARIABLE_ALL_VARIANTS(vector);
-+ DECL_VARIABLE_ALL_VARIANTS(vector2);
-+ DECL_VARIABLE(vector, float, 64, 2);
-+ DECL_VARIABLE(vector2, float, 64, 2);
-+
-+ DECL_VARIABLE_ALL_VARIANTS(vector_res);
-+ DECL_VARIABLE(vector_res, float, 64, 2);
-+
-+ clean_results ();
-+ /* We don't have vuzp1_T64x1, so set expected to the clean value. */
-+ CLEAN(expected, int, 64, 1);
-+ CLEAN(expected, uint, 64, 1);
-+
-+ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD(vector, buffer, , float, f, 32, 2);
-+ VLOAD(vector, buffer, q, float, f, 32, 4);
-+ VLOAD(vector, buffer, q, float, f, 64, 2);
-+
-+ /* Choose arbitrary initialization values. */
-+ VDUP(vector2, , int, s, 8, 8, 0x11);
-+ VDUP(vector2, , int, s, 16, 4, 0x22);
-+ VDUP(vector2, , int, s, 32, 2, 0x33);
-+ VDUP(vector2, , uint, u, 8, 8, 0x55);
-+ VDUP(vector2, , uint, u, 16, 4, 0x66);
-+ VDUP(vector2, , uint, u, 32, 2, 0x77);
-+ VDUP(vector2, , poly, p, 8, 8, 0x55);
-+ VDUP(vector2, , poly, p, 16, 4, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */
-+#endif
-+ VDUP(vector2, , float, f, 32, 2, 33.6f);
-+
-+ VDUP(vector2, q, int, s, 8, 16, 0x11);
-+ VDUP(vector2, q, int, s, 16, 8, 0x22);
-+ VDUP(vector2, q, int, s, 32, 4, 0x33);
-+ VDUP(vector2, q, int, s, 64, 2, 0x44);
-+ VDUP(vector2, q, uint, u, 8, 16, 0x55);
-+ VDUP(vector2, q, uint, u, 16, 8, 0x66);
-+ VDUP(vector2, q, uint, u, 32, 4, 0x77);
-+ VDUP(vector2, q, uint, u, 64, 2, 0x88);
-+ VDUP(vector2, q, poly, p, 8, 16, 0x55);
-+ VDUP(vector2, q, poly, p, 16, 8, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, q, float, f, 16, 8, 14.6f);
-+#endif
-+ VDUP(vector2, q, float, f, 32, 4, 33.8f);
-+ VDUP(vector2, q, float, f, 64, 2, 33.8f);
-+
-+ TEST_VUZP1(, int, s, 8, 8);
-+ TEST_VUZP1(, int, s, 16, 4);
-+ TEST_VUZP1(, int, s, 32, 2);
-+ TEST_VUZP1(, uint, u, 8, 8);
-+ TEST_VUZP1(, uint, u, 16, 4);
-+ TEST_VUZP1(, uint, u, 32, 2);
-+ TEST_VUZP1(, poly, p, 8, 8);
-+ TEST_VUZP1(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VUZP1(, float, f, 16, 4);
-+#endif
-+ TEST_VUZP1(, float, f, 32, 2);
-+
-+ TEST_VUZP1(q, int, s, 8, 16);
-+ TEST_VUZP1(q, int, s, 16, 8);
-+ TEST_VUZP1(q, int, s, 32, 4);
-+ TEST_VUZP1(q, int, s, 64, 2);
-+ TEST_VUZP1(q, uint, u, 8, 16);
-+ TEST_VUZP1(q, uint, u, 16, 8);
-+ TEST_VUZP1(q, uint, u, 32, 4);
-+ TEST_VUZP1(q, uint, u, 64, 2);
-+ TEST_VUZP1(q, poly, p, 8, 16);
-+ TEST_VUZP1(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VUZP1(q, float, f, 16, 8);
-+#endif
-+ TEST_VUZP1(q, float, f, 32, 4);
-+ TEST_VUZP1(q, float, f, 64, 2);
-+
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS (TEST_MSG, "");
-+#else
-+ CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
-+#endif
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VUZP2"
-+
-+#define TEST_VUZP2(Q, T1, T2, W, N) TEST_VUZP(2, Q, T1, T2, W, N)
-+
-+/* Expected results. */
-+VECT_VAR_DECL(expected2,int,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
-+ 0x11, 0x11, 0x11, 0x11 };
-+VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff1, 0xfff3, 0x22, 0x22 };
-+VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0x33 };
-+VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 };
-+VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0x77 };
-+VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff1, 0xfff3, 0x66, 0x66 };
-+VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb80, 0xca80,
-+ 0x4b4d, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
-+ 0xf9, 0xfb, 0xfd, 0xff,
-+ 0x11, 0x11, 0x11, 0x11,
-+ 0x11, 0x11, 0x11, 0x11 };
-+VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7,
-+ 0x22, 0x22, 0x22, 0x22 };
-+VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff1, 0xfffffff3,
-+ 0x33, 0x33 };
-+VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff1,
-+ 0x44 };
-+VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
-+ 0xf9, 0xfb, 0xfd, 0xff,
-+ 0x55, 0x55, 0x55, 0x55,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7,
-+ 0x66, 0x66, 0x66, 0x66 };
-+VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff1, 0xfffffff3, 0x77, 0x77 };
-+VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff1,
-+ 0x88 };
-+VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf1, 0xf3, 0xf5, 0xf7,
-+ 0xf9, 0xfb, 0xfd, 0xff,
-+ 0x55, 0x55, 0x55, 0x55,
-+ 0x55, 0x55, 0x55, 0x55 };
-+VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff1, 0xfff3, 0xfff5, 0xfff7,
-+ 0x66, 0x66, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xcb80, 0xca80, 0xc980, 0xc880,
-+ 0x4b4d, 0x4b4d, 0x4b4d, 0x4b4d
-+ };
-+#endif
-+VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1700000, 0xc1500000,
-+ 0x42073333, 0x42073333 };
-+
-+ clean_results ();
-+ CLEAN(expected2, int, 64, 1);
-+ CLEAN(expected2, uint, 64, 1);
-+
-+ TEST_VUZP2(, int, s, 8, 8);
-+ TEST_VUZP2(, int, s, 16, 4);
-+ TEST_VUZP2(, int, s, 32, 2);
-+ TEST_VUZP2(, uint, u, 8, 8);
-+ TEST_VUZP2(, uint, u, 16, 4);
-+ TEST_VUZP2(, uint, u, 32, 2);
-+ TEST_VUZP2(, poly, p, 8, 8);
-+ TEST_VUZP2(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VUZP2(, float, f, 16, 4);
-+#endif
-+ TEST_VUZP2(, float, f, 32, 2);
-+
-+ TEST_VUZP2(q, int, s, 8, 16);
-+ TEST_VUZP2(q, int, s, 16, 8);
-+ TEST_VUZP2(q, int, s, 32, 4);
-+ TEST_VUZP2(q, int, s, 64, 2);
-+ TEST_VUZP2(q, uint, u, 8, 16);
-+ TEST_VUZP2(q, uint, u, 16, 8);
-+ TEST_VUZP2(q, uint, u, 32, 4);
-+ TEST_VUZP2(q, uint, u, 64, 2);
-+ TEST_VUZP2(q, poly, p, 8, 16);
-+ TEST_VUZP2(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VUZP2(q, float, f, 16, 8);
-+#endif
-+ TEST_VUZP2(q, float, f, 32, 4);
-+ TEST_VUZP2(q, float, f, 64, 2);
-+
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
-+#if defined (FP16_SUPPORTED)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, "");
-+#endif
-+}
-+
-+int main (void)
-+{
-+ exec_vuzp_half ();
-+ return 0;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip.c
-@@ -18,6 +18,10 @@ VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf4, 0x55, 0x55,
- 0xf1, 0xf5, 0x55, 0x55 };
- VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff2,
- 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 4) [] = { 0xcc00, 0xcb00,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
- VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf8, 0x11, 0x11,
- 0xf1, 0xf9, 0x11, 0x11,
-@@ -41,6 +45,12 @@ VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf8, 0x55, 0x55,
- 0xf3, 0xfb, 0x55, 0x55 };
- VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff4, 0x66, 0x66,
- 0xfff1, 0xfff5, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected0, hfloat, 16, 8) [] = { 0xcc00, 0xca00,
-+ 0x4b4d, 0x4b4d,
-+ 0xcb80, 0xc980,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1600000,
- 0x42073333, 0x42073333 };
-
-@@ -59,6 +69,10 @@ VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf2, 0xf6, 0x55, 0x55,
- 0xf3, 0xf7, 0x55, 0x55 };
- VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff3,
- 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 4) [] = { 0xcb80, 0xca80,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0x42066666, 0x42066666 };
- VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf4, 0xfc, 0x11, 0x11,
- 0xf5, 0xfd, 0x11, 0x11,
-@@ -82,6 +96,12 @@ VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf4, 0xfc, 0x55, 0x55,
- 0xf7, 0xff, 0x55, 0x55 };
- VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff2, 0xfff6, 0x66, 0x66,
- 0xfff3, 0xfff7, 0x66, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected1, hfloat, 16, 8) [] = { 0xcb00, 0xc900,
-+ 0x4b4d, 0x4b4d,
-+ 0xca80, 0xc880,
-+ 0x4b4d, 0x4b4d };
-+#endif
- VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1500000,
- 0x42073333, 0x42073333 };
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vzip_half.c
-@@ -0,0 +1,266 @@
-+/* { dg-do run } */
-+/* { dg-skip-if "" { arm*-*-* } } */
-+
-+#include <arm_neon.h>
-+#include "arm-neon-ref.h"
-+#include "compute-ref-data.h"
-+
-+/* Expected results. */
-+VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0x11, 0xf1, 0x11,
-+ 0xf2, 0x11, 0xf3, 0x11 };
-+VECT_VAR_DECL(expected,int,16,4) [] = { 0xfff0, 0x22, 0xfff1, 0x22 };
-+VECT_VAR_DECL(expected,int,32,2) [] = { 0xfffffff0, 0x33 };
-+VECT_VAR_DECL(expected,int,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected,uint,8,8) [] = { 0xf0, 0x55, 0xf1, 0x55,
-+ 0xf2, 0x55, 0xf3, 0x55 };
-+VECT_VAR_DECL(expected,uint,16,4) [] = { 0xfff0, 0x66, 0xfff1, 0x66 };
-+VECT_VAR_DECL(expected,uint,32,2) [] = { 0xfffffff0, 0x77 };
-+VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
-+VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0x55, 0xf1, 0x55,
-+ 0xf2, 0x55, 0xf3, 0x55 };
-+VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0x66, 0xfff1, 0x66 };
-+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0x42066666 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 4) [] = { 0xcc00, 0x4b4d,
-+ 0xcb80, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0x11, 0xf1, 0x11,
-+ 0xf2, 0x11, 0xf3, 0x11,
-+ 0xf4, 0x11, 0xf5, 0x11,
-+ 0xf6, 0x11, 0xf7, 0x11 };
-+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff0, 0x22, 0xfff1, 0x22,
-+ 0xfff2, 0x22, 0xfff3, 0x22 };
-+VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff0, 0x33,
-+ 0xfffffff1, 0x33 };
-+VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
-+ 0x44 };
-+VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0x55, 0xf1, 0x55,
-+ 0xf2, 0x55, 0xf3, 0x55,
-+ 0xf4, 0x55, 0xf5, 0x55,
-+ 0xf6, 0x55, 0xf7, 0x55 };
-+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0x66, 0xfff1, 0x66,
-+ 0xfff2, 0x66, 0xfff3, 0x66 };
-+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0x77,
-+ 0xfffffff1, 0x77 };
-+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfffffffffffffff0,
-+ 0x88 };
-+VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0x55, 0xf1, 0x55,
-+ 0xf2, 0x55, 0xf3, 0x55,
-+ 0xf4, 0x55, 0xf5, 0x55,
-+ 0xf6, 0x55, 0xf7, 0x55 };
-+VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0x66, 0xfff1, 0x66,
-+ 0xfff2, 0x66, 0xfff3, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected, hfloat, 16, 8) [] = { 0xcc00, 0x4b4d,
-+ 0xcb80, 0x4b4d,
-+ 0xcb00, 0x4b4d,
-+ 0xca80, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0x42073333,
-+ 0xc1700000, 0x42073333 };
-+
-+#define TEST_MSG "VZIP1"
-+void exec_vzip_half (void)
-+{
-+#define TEST_VZIP(PART, Q, T1, T2, W, N) \
-+ VECT_VAR(vector_res, T1, W, N) = \
-+ vzip##PART##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \
-+ VECT_VAR(vector2, T1, W, N)); \
-+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
-+
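-+/* As in vuzp_half.c, but exercising the interleaving vzip1/vzip2
-+   intrinsics.  */
-+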
-+#define TEST_VZIP1(Q, T1, T2, W, N) TEST_VZIP(1, Q, T1, T2, W, N)
-+
-+  /* float64x2 variants are not covered by DECL_VARIABLE_ALL_VARIANTS.  */
-+ DECL_VARIABLE_ALL_VARIANTS(vector);
-+ DECL_VARIABLE_ALL_VARIANTS(vector2);
-+ DECL_VARIABLE(vector, float, 64, 2);
-+ DECL_VARIABLE(vector2, float, 64, 2);
-+
-+ DECL_VARIABLE_ALL_VARIANTS(vector_res);
-+ DECL_VARIABLE(vector_res, float, 64, 2);
-+
-+ clean_results ();
-+ /* We don't have vzip1_T64x1, so set expected to the clean value. */
-+ CLEAN(expected, int, 64, 1);
-+ CLEAN(expected, uint, 64, 1);
-+
-+ TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
-+#if defined (FP16_SUPPORTED)
-+ VLOAD(vector, buffer, , float, f, 16, 4);
-+ VLOAD(vector, buffer, q, float, f, 16, 8);
-+#endif
-+ VLOAD(vector, buffer, , float, f, 32, 2);
-+ VLOAD(vector, buffer, q, float, f, 32, 4);
-+ VLOAD(vector, buffer, q, float, f, 64, 2);
-+
-+ /* Choose arbitrary initialization values. */
-+ VDUP(vector2, , int, s, 8, 8, 0x11);
-+ VDUP(vector2, , int, s, 16, 4, 0x22);
-+ VDUP(vector2, , int, s, 32, 2, 0x33);
-+ VDUP(vector2, , uint, u, 8, 8, 0x55);
-+ VDUP(vector2, , uint, u, 16, 4, 0x66);
-+ VDUP(vector2, , uint, u, 32, 2, 0x77);
-+ VDUP(vector2, , poly, p, 8, 8, 0x55);
-+ VDUP(vector2, , poly, p, 16, 4, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, , float, f, 16, 4, 14.6f); /* 14.6f is 0x4b4d. */
-+#endif
-+ VDUP(vector2, , float, f, 32, 2, 33.6f);
-+
-+ VDUP(vector2, q, int, s, 8, 16, 0x11);
-+ VDUP(vector2, q, int, s, 16, 8, 0x22);
-+ VDUP(vector2, q, int, s, 32, 4, 0x33);
-+ VDUP(vector2, q, int, s, 64, 2, 0x44);
-+ VDUP(vector2, q, uint, u, 8, 16, 0x55);
-+ VDUP(vector2, q, uint, u, 16, 8, 0x66);
-+ VDUP(vector2, q, uint, u, 32, 4, 0x77);
-+ VDUP(vector2, q, uint, u, 64, 2, 0x88);
-+ VDUP(vector2, q, poly, p, 8, 16, 0x55);
-+ VDUP(vector2, q, poly, p, 16, 8, 0x66);
-+#if defined (FP16_SUPPORTED)
-+ VDUP (vector2, q, float, f, 16, 8, 14.6f);
-+#endif
-+ VDUP(vector2, q, float, f, 32, 4, 33.8f);
-+ VDUP(vector2, q, float, f, 64, 2, 33.8f);
-+
-+ TEST_VZIP1(, int, s, 8, 8);
-+ TEST_VZIP1(, int, s, 16, 4);
-+ TEST_VZIP1(, int, s, 32, 2);
-+ TEST_VZIP1(, uint, u, 8, 8);
-+ TEST_VZIP1(, uint, u, 16, 4);
-+ TEST_VZIP1(, uint, u, 32, 2);
-+ TEST_VZIP1(, poly, p, 8, 8);
-+ TEST_VZIP1(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VZIP1(, float, f, 16, 4);
-+#endif
-+ TEST_VZIP1(, float, f, 32, 2);
-+
-+ TEST_VZIP1(q, int, s, 8, 16);
-+ TEST_VZIP1(q, int, s, 16, 8);
-+ TEST_VZIP1(q, int, s, 32, 4);
-+ TEST_VZIP1(q, int, s, 64, 2);
-+ TEST_VZIP1(q, uint, u, 8, 16);
-+ TEST_VZIP1(q, uint, u, 16, 8);
-+ TEST_VZIP1(q, uint, u, 32, 4);
-+ TEST_VZIP1(q, uint, u, 64, 2);
-+ TEST_VZIP1(q, poly, p, 8, 16);
-+ TEST_VZIP1(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VZIP1(q, float, f, 16, 8);
-+#endif
-+ TEST_VZIP1(q, float, f, 32, 4);
-+ TEST_VZIP1(q, float, f, 64, 2);
-+
-+#if defined (FP16_SUPPORTED)
-+ CHECK_RESULTS (TEST_MSG, "");
-+#else
-+ CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
-+#endif
-+
-+#undef TEST_MSG
-+#define TEST_MSG "VZIP2"
-+
-+#define TEST_VZIP2(Q, T1, T2, W, N) TEST_VZIP(2, Q, T1, T2, W, N)
-+
-+/* Expected results. */
-+VECT_VAR_DECL(expected2,int,8,8) [] = { 0xf4, 0x11, 0xf5, 0x11,
-+ 0xf6, 0x11, 0xf7, 0x11 };
-+VECT_VAR_DECL(expected2,int,16,4) [] = { 0xfff2, 0x22, 0xfff3, 0x22 };
-+VECT_VAR_DECL(expected2,int,32,2) [] = { 0xfffffff1, 0x33 };
-+VECT_VAR_DECL(expected2,int,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected2,uint,8,8) [] = { 0xf4, 0x55, 0xf5, 0x55,
-+ 0xf6, 0x55, 0xf7, 0x55 };
-+VECT_VAR_DECL(expected2,uint,16,4) [] = { 0xfff2, 0x66, 0xfff3, 0x66 };
-+VECT_VAR_DECL(expected2,uint,32,2) [] = { 0xfffffff1, 0x77 };
-+VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff1 };
-+VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf4, 0x55, 0xf5, 0x55,
-+ 0xf6, 0x55, 0xf7, 0x55 };
-+VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0x66, 0xfff3, 0x66 };
-+VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1700000, 0x42066666 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 4) [] = { 0xcb00, 0x4b4d,
-+ 0xca80, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf8, 0x11, 0xf9, 0x11,
-+ 0xfa, 0x11, 0xfb, 0x11,
-+ 0xfc, 0x11, 0xfd, 0x11,
-+ 0xfe, 0x11, 0xff, 0x11 };
-+VECT_VAR_DECL(expected2,int,16,8) [] = { 0xfff4, 0x22, 0xfff5, 0x22,
-+ 0xfff6, 0x22, 0xfff7, 0x22 };
-+VECT_VAR_DECL(expected2,int,32,4) [] = { 0xfffffff2, 0x33,
-+ 0xfffffff3, 0x33 };
-+VECT_VAR_DECL(expected2,int,64,2) [] = { 0xfffffffffffffff1,
-+ 0x44 };
-+VECT_VAR_DECL(expected2,uint,8,16) [] = { 0xf8, 0x55, 0xf9, 0x55,
-+ 0xfa, 0x55, 0xfb, 0x55,
-+ 0xfc, 0x55, 0xfd, 0x55,
-+ 0xfe, 0x55, 0xff, 0x55 };
-+VECT_VAR_DECL(expected2,uint,16,8) [] = { 0xfff4, 0x66, 0xfff5, 0x66,
-+ 0xfff6, 0x66, 0xfff7, 0x66 };
-+VECT_VAR_DECL(expected2,uint,32,4) [] = { 0xfffffff2, 0x77,
-+ 0xfffffff3, 0x77 };
-+VECT_VAR_DECL(expected2,uint,64,2) [] = { 0xfffffffffffffff1,
-+ 0x88 };
-+VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf8, 0x55, 0xf9, 0x55,
-+ 0xfa, 0x55, 0xfb, 0x55,
-+ 0xfc, 0x55, 0xfd, 0x55,
-+ 0xfe, 0x55, 0xff, 0x55 };
-+VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff4, 0x66, 0xfff5, 0x66,
-+ 0xfff6, 0x66, 0xfff7, 0x66 };
-+#if defined (FP16_SUPPORTED)
-+VECT_VAR_DECL (expected2, hfloat, 16, 8) [] = { 0xca00, 0x4b4d,
-+ 0xc980, 0x4b4d,
-+ 0xc900, 0x4b4d,
-+ 0xc880, 0x4b4d };
-+#endif
-+VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0x42073333,
-+ 0xc1500000, 0x42073333 };
-+ clean_results ();
-+ CLEAN(expected2, int, 64, 1);
-+ CLEAN(expected2, uint, 64, 1);
-+
-+ TEST_VZIP2(, int, s, 8, 8);
-+ TEST_VZIP2(, int, s, 16, 4);
-+ TEST_VZIP2(, int, s, 32, 2);
-+ TEST_VZIP2(, uint, u, 8, 8);
-+ TEST_VZIP2(, uint, u, 16, 4);
-+ TEST_VZIP2(, uint, u, 32, 2);
-+ TEST_VZIP2(, poly, p, 8, 8);
-+ TEST_VZIP2(, poly, p, 16, 4);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VZIP2(, float, f, 16, 4);
-+#endif
-+ TEST_VZIP2(, float, f, 32, 2);
-+
-+ TEST_VZIP2(q, int, s, 8, 16);
-+ TEST_VZIP2(q, int, s, 16, 8);
-+ TEST_VZIP2(q, int, s, 32, 4);
-+ TEST_VZIP2(q, int, s, 64, 2);
-+ TEST_VZIP2(q, uint, u, 8, 16);
-+ TEST_VZIP2(q, uint, u, 16, 8);
-+ TEST_VZIP2(q, uint, u, 32, 4);
-+ TEST_VZIP2(q, uint, u, 64, 2);
-+ TEST_VZIP2(q, poly, p, 8, 16);
-+ TEST_VZIP2(q, poly, p, 16, 8);
-+#if defined (FP16_SUPPORTED)
-+ TEST_VZIP2(q, float, f, 16, 8);
-+#endif
-+ TEST_VZIP2(q, float, f, 32, 4);
-+ TEST_VZIP2(q, float, f, 64, 2);
-+
-+ CHECK_RESULTS_NAMED (TEST_MSG, expected2, "");
-+#if defined (FP16_SUPPORTED)
-+ CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected2, "");
-+ CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected2, "");
-+#endif
-+}
-+
-+int main (void)
-+{
-+ exec_vzip_half ();
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/ands_3.c
-@@ -0,0 +1,15 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
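-+/* The x == 0 test should be merged with the zero-extension of x
-+   into a single ANDS with an immediate of 255, as checked below.  */
-+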
-+int
-+f9 (unsigned char x, int y)
-+{
-+ if (y > 1 && x == 0)
-+ return 10;
-+ return x;
-+}
-+
-+/* { dg-final { scan-assembler "ands\t(x|w)\[0-9\]+,\[ \t\]*(x|w)\[0-9\]+,\[ \t\]*255" } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
-@@ -1,4 +1,5 @@
- /* { dg-error "unknown" "" {target "aarch64*-*-*" } } */
-+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
- /* { dg-options "-O2 -mcpu=dummy" } */
-
- void f ()
---- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
-@@ -1,4 +1,5 @@
- /* { dg-error "missing" "" {target "aarch64*-*-*" } } */
-+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
- /* { dg-options "-O2 -mcpu=cortex-a53+no" } */
-
- void f ()
---- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
-@@ -1,4 +1,5 @@
- /* { dg-error "invalid feature" "" {target "aarch64*-*-*" } } */
-+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
- /* { dg-options "-O2 -mcpu=cortex-a53+dummy" } */
-
- void f ()
---- a/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
-@@ -1,4 +1,5 @@
- /* { dg-error "missing" "" {target "aarch64*-*-*" } } */
-+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
- /* { dg-options "-O2 -mcpu=+dummy" } */
-
- void f ()
---- a/src/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/fmaxmin.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-O2 -ftree-vectorize -fno-inline -save-temps" } */
-+/* { dg-options "-O2 -ftree-vectorize -fno-inline -fno-vect-cost-model -save-temps" } */
-
-
- extern void abort (void);
---- a/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
-@@ -110,6 +110,6 @@ main (int argc, char **argv)
- /* vfmaq_lane_f64.
- vfma_laneq_f64.
- vfmaq_laneq_f64. */
--/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
-+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
-
-
---- a/src/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
-@@ -111,6 +111,6 @@ main (int argc, char **argv)
- /* vfmsq_lane_f64.
- vfms_laneq_f64.
- vfmsq_laneq_f64. */
--/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
-+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */
-
-
---- a/src/gcc/testsuite/gcc.target/aarch64/fmovd-zero-reg.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/fmovd-zero-reg.c
-@@ -8,4 +8,4 @@ foo (void)
- bar (0.0);
- }
-
--/* { dg-final { scan-assembler "fmov\\td0, xzr" } } */
-+/* { dg-final { scan-assembler "movi\\td0, #0" } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/fmovf-zero-reg.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/fmovf-zero-reg.c
-@@ -8,4 +8,4 @@ foo (void)
- bar (0.0);
- }
-
--/* { dg-final { scan-assembler "fmov\\ts0, wzr" } } */
-+/* { dg-final { scan-assembler "movi\\tv0\.2s, #0" } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/fmul_fcvt_2.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline" } */
-+/* { dg-options "-save-temps -O2 -ftree-vectorize -fno-inline -fno-vect-cost-model" } */
-
- #define N 1024
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_subreg_1.c
-@@ -0,0 +1,30 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fdump-rtl-ce1" } */
-+
-+/* Check that the inner if is transformed into CSELs. */
-+
-+int
-+foo (int *x, int *z, int a)
-+{
-+ int b = 0;
-+ int c = 0;
-+ int d = 0;
-+ int i;
-+
-+ for (i = 0; i < a; i++)
-+ {
-+ if (x[i] < c)
-+ {
-+ b = z[i];
-+ if (c < b)
-+ {
-+ c = b;
-+ d = i;
-+ }
-+ }
-+ }
-+
-+ return c + d;
-+}
-+
-+/* { dg-final { scan-rtl-dump "if-conversion succeeded through noce_convert_multiple_sets" "ce1" } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c
-@@ -1,4 +1,4 @@
--/* { dg-options "-O2" } */
-+/* { dg-options "-O2 -mcpu=generic" } */
-
- int arr[4][4];
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/ldp_stp_unaligned_1.c
-@@ -0,0 +1,20 @@
-+/* { dg-options "-O2" } */
-+
-+/* Check that we can use a REG + IMM addressing mode when moving an unaligned
-+ TImode value to and from memory. */
-+
-+struct foo
-+{
-+ long long b;
-+ __int128 a;
-+} __attribute__ ((packed));
-+
-+void
-+bar (struct foo *p, struct foo *q)
-+{
-+ p->a = q->a;
-+}
-+
-+/* { dg-final { scan-assembler-not "add\tx\[0-9\]+, x\[0-9\]+" } } */
-+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\], .*8" 1 } } */
-+/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\], .*8" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/popcnt.c
-@@ -0,0 +1,26 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
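-+/* Check that the __builtin_popcount{,l,ll} calls are expanded using
-+   the Advanced SIMD CNT instruction rather than a library call.  */
-+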
-+int
-+foo (int x)
-+{
-+ return __builtin_popcount (x);
-+}
-+
-+long
-+foo1 (long x)
-+{
-+ return __builtin_popcountl (x);
-+}
-+
-+long long
-+foo2 (long long x)
-+{
-+ return __builtin_popcountll (x);
-+}
-+
-+/* { dg-final { scan-assembler-not "popcount" } } */
-+/* { dg-final { scan-assembler-times "cnt\t" 3 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/pr37780_1.c
-@@ -0,0 +1,46 @@
-+/* Test that we can remove the conditional move due to CLZ
-+ and CTZ being defined at zero. */
-+
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
-+int
-+fooctz (int i)
-+{
-+ return (i == 0) ? 32 : __builtin_ctz (i);
-+}
-+
-+int
-+fooctz2 (int i)
-+{
-+ return (i != 0) ? __builtin_ctz (i) : 32;
-+}
-+
-+unsigned int
-+fooctz3 (unsigned int i)
-+{
-+ return (i > 0) ? __builtin_ctz (i) : 32;
-+}
-+
-+/* { dg-final { scan-assembler-times "rbit\t*" 3 } } */
-+
-+int
-+fooclz (int i)
-+{
-+ return (i == 0) ? 32 : __builtin_clz (i);
-+}
-+
-+int
-+fooclz2 (int i)
-+{
-+ return (i != 0) ? __builtin_clz (i) : 32;
-+}
-+
-+unsigned int
-+fooclz3 (unsigned int i)
-+{
-+ return (i > 0) ? __builtin_clz (i) : 32;
-+}
-+
-+/* { dg-final { scan-assembler-times "clz\t" 6 } } */
-+/* { dg-final { scan-assembler-not "cmp\t.*0" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/pr63874.c
-@@ -0,0 +1,26 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-skip-if "Not applicable for mcmodel=large" { aarch64*-*-* } { "-mcmodel=large" } { "" } } */
-+
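-+/* The address of the possibly-undefined weak function must not be
-+   computed with an ADR sequence, nor may the weak definition of bar
-+   be referenced through a constant pool entry; see the scans below.  */
-+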
-+extern void __attribute__((weak)) foo_weakref (void);
-+void __attribute__((weak, noinline)) bar (void)
-+{
-+ return;
-+}
-+void (*f) (void);
-+void (*g) (void);
-+
-+int
-+main (void)
-+{
-+ f = &foo_weakref;
-+ g = &bar;
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler-not "adr*foo_weakref" } } */
-+/* { dg-final { scan-assembler-not "\\.(word|xword)\tbar" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/pr71727.c
-@@ -0,0 +1,33 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mstrict-align -O3" } */
-+
-+struct test_struct_s
-+{
-+ long a;
-+ long b;
-+ long c;
-+ long d;
-+ unsigned long e;
-+};
-+
-+
-+char _a;
-+struct test_struct_s xarray[128];
-+
-+void
-+_start (void)
-+{
-+ struct test_struct_s *new_entry;
-+
-+ new_entry = &xarray[0];
-+ new_entry->a = 1;
-+ new_entry->b = 2;
-+ new_entry->c = 3;
-+ new_entry->d = 4;
-+ new_entry->e = 5;
-+
-+ return;
-+}
-+
-+/* { dg-final { scan-assembler-times "mov\tx" 5 {target lp64} } } */
-+/* { dg-final { scan-assembler-not "add\tx0, x0, :" {target lp64} } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/pr78382.c
-@@ -0,0 +1,10 @@
-+/* { dg-require-effective-target fpic } */
-+/* { dg-options "-mtls-dialect=trad -fpic" } */
-+
-+__thread int abc;
-+void
-+foo ()
-+{
-+ int *p;
-+ p = &abc;
-+}
---- a/src/gcc/testsuite/gcc.target/aarch64/simd/vminmaxnm_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/simd/vminmaxnm_1.c
-@@ -1,4 +1,4 @@
--/* Test the `v[min|max]nm{q}_f*' AArch64 SIMD intrinsic. */
-+/* Test the `v[min|max]{nm}{q}_f*' AArch64 SIMD intrinsic. */
-
- /* { dg-do run } */
- /* { dg-options "-O2" } */
-@@ -18,6 +18,7 @@ extern void abort ();
- int
- main (int argc, char **argv)
- {
-+ /* v{min|max}nm_f32 normal. */
- float32x2_t f32x2_input1 = vdup_n_f32 (-1.0);
- float32x2_t f32x2_input2 = vdup_n_f32 (0.0);
- float32x2_t f32x2_exp_minnm = vdup_n_f32 (-1.0);
-@@ -28,6 +29,7 @@ main (int argc, char **argv)
- CHECK (uint32_t, 2, f32x2_ret_minnm, f32x2_exp_minnm);
- CHECK (uint32_t, 2, f32x2_ret_maxnm, f32x2_exp_maxnm);
-
-+ /* v{min|max}nm_f32 NaN. */
- f32x2_input1 = vdup_n_f32 (__builtin_nanf (""));
- f32x2_input2 = vdup_n_f32 (1.0);
- f32x2_exp_minnm = vdup_n_f32 (1.0);
-@@ -38,6 +40,7 @@ main (int argc, char **argv)
- CHECK (uint32_t, 2, f32x2_ret_minnm, f32x2_exp_minnm);
- CHECK (uint32_t, 2, f32x2_ret_maxnm, f32x2_exp_maxnm);
-
-+ /* v{min|max}nmq_f32 normal. */
- float32x4_t f32x4_input1 = vdupq_n_f32 (-1024.0);
- float32x4_t f32x4_input2 = vdupq_n_f32 (77.0);
- float32x4_t f32x4_exp_minnm = vdupq_n_f32 (-1024.0);
-@@ -48,6 +51,7 @@ main (int argc, char **argv)
- CHECK (uint32_t, 4, f32x4_ret_minnm, f32x4_exp_minnm);
- CHECK (uint32_t, 4, f32x4_ret_maxnm, f32x4_exp_maxnm);
-
-+ /* v{min|max}nmq_f32 NaN. */
- f32x4_input1 = vdupq_n_f32 (-__builtin_nanf (""));
- f32x4_input2 = vdupq_n_f32 (-1.0);
- f32x4_exp_minnm = vdupq_n_f32 (-1.0);
-@@ -58,16 +62,57 @@ main (int argc, char **argv)
- CHECK (uint32_t, 4, f32x4_ret_minnm, f32x4_exp_minnm);
- CHECK (uint32_t, 4, f32x4_ret_maxnm, f32x4_exp_maxnm);
-
-+ /* v{min|max}nm_f64 normal. */
-+ float64x1_t f64x1_input1 = vdup_n_f64 (1.23);
-+ float64x1_t f64x1_input2 = vdup_n_f64 (4.56);
-+ float64x1_t f64x1_exp_minnm = vdup_n_f64 (1.23);
-+ float64x1_t f64x1_exp_maxnm = vdup_n_f64 (4.56);
-+ float64x1_t f64x1_ret_minnm = vminnm_f64 (f64x1_input1, f64x1_input2);
-+ float64x1_t f64x1_ret_maxnm = vmaxnm_f64 (f64x1_input1, f64x1_input2);
-+ CHECK (uint64_t, 1, f64x1_ret_minnm, f64x1_exp_minnm);
-+ CHECK (uint64_t, 1, f64x1_ret_maxnm, f64x1_exp_maxnm);
-+
-+ /* v{min|max}_f64 normal. */
-+ float64x1_t f64x1_exp_min = vdup_n_f64 (1.23);
-+ float64x1_t f64x1_exp_max = vdup_n_f64 (4.56);
-+ float64x1_t f64x1_ret_min = vmin_f64 (f64x1_input1, f64x1_input2);
-+ float64x1_t f64x1_ret_max = vmax_f64 (f64x1_input1, f64x1_input2);
-+ CHECK (uint64_t, 1, f64x1_ret_min, f64x1_exp_min);
-+ CHECK (uint64_t, 1, f64x1_ret_max, f64x1_exp_max);
-+
-+ /* v{min|max}nmq_f64 normal. */
- float64x2_t f64x2_input1 = vdupq_n_f64 (1.23);
- float64x2_t f64x2_input2 = vdupq_n_f64 (4.56);
- float64x2_t f64x2_exp_minnm = vdupq_n_f64 (1.23);
- float64x2_t f64x2_exp_maxnm = vdupq_n_f64 (4.56);
- float64x2_t f64x2_ret_minnm = vminnmq_f64 (f64x2_input1, f64x2_input2);
- float64x2_t f64x2_ret_maxnm = vmaxnmq_f64 (f64x2_input1, f64x2_input2);
--
- CHECK (uint64_t, 2, f64x2_ret_minnm, f64x2_exp_minnm);
- CHECK (uint64_t, 2, f64x2_ret_maxnm, f64x2_exp_maxnm);
-
-+ /* v{min|max}nm_f64 NaN. */
-+ f64x1_input1 = vdup_n_f64 (-__builtin_nanf (""));
-+ f64x1_input2 = vdup_n_f64 (1.0);
-+ f64x1_exp_minnm = vdup_n_f64 (1.0);
-+ f64x1_exp_maxnm = vdup_n_f64 (1.0);
-+ f64x1_ret_minnm = vminnm_f64 (f64x1_input1, f64x1_input2);
-+ f64x1_ret_maxnm = vmaxnm_f64 (f64x1_input1, f64x1_input2);
-+
-+ CHECK (uint64_t, 1, f64x1_ret_minnm, f64x1_exp_minnm);
-+ CHECK (uint64_t, 1, f64x1_ret_maxnm, f64x1_exp_maxnm);
-+
-+ /* v{min|max}_f64 NaN. */
-+ f64x1_input1 = vdup_n_f64 (-__builtin_nanf (""));
-+ f64x1_input2 = vdup_n_f64 (1.0);
-+ f64x1_exp_minnm = vdup_n_f64 (-__builtin_nanf (""));
-+ f64x1_exp_maxnm = vdup_n_f64 (-__builtin_nanf (""));
-+ f64x1_ret_minnm = vmin_f64 (f64x1_input1, f64x1_input2);
-+ f64x1_ret_maxnm = vmax_f64 (f64x1_input1, f64x1_input2);
-+
-+ CHECK (uint64_t, 1, f64x1_ret_minnm, f64x1_exp_minnm);
-+ CHECK (uint64_t, 1, f64x1_ret_maxnm, f64x1_exp_maxnm);
-+
-+ /* v{min|max}nmq_f64 NaN. */
- f64x2_input1 = vdupq_n_f64 (-__builtin_nan (""));
- f64x2_input2 = vdupq_n_f64 (1.0);
- f64x2_exp_minnm = vdupq_n_f64 (1.0);
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
-@@ -0,0 +1,546 @@
-+/* Test the vmul{q}_n_* AArch64 SIMD intrinsics.  */
-+
-+/* { dg-do run } */
-+/* { dg-options "-O2 --save-temps" } */
-+
-+#include "arm_neon.h"
-+
-+extern void abort (void);
-+
-+#define A (132.4f)
-+#define B (-0.0f)
-+#define C (-34.8f)
-+#define D (289.34f)
-+float32_t expected2_1[2] = {A * A, B * A};
-+float32_t expected2_2[2] = {A * B, B * B};
-+float32_t expected4_1[4] = {A * A, B * A, C * A, D * A};
-+float32_t expected4_2[4] = {A * B, B * B, C * B, D * B};
-+float32_t expected4_3[4] = {A * C, B * C, C * C, D * C};
-+float32_t expected4_4[4] = {A * D, B * D, C * D, D * D};
-+float32_t _elemA = A;
-+float32_t _elemB = B;
-+float32_t _elemC = C;
-+float32_t _elemD = D;
-+
-+#define AD (1234.5)
-+#define BD (-0.0)
-+#define CD (71.3)
-+#define DD (-1024.4)
-+float64_t expectedd2_1[2] = {AD * CD, BD * CD};
-+float64_t expectedd2_2[2] = {AD * DD, BD * DD};
-+float64_t _elemdC = CD;
-+float64_t _elemdD = DD;
-+
-+
-+#define AS (1024)
-+#define BS (-31)
-+#define CS (0)
-+#define DS (655)
-+int32_t expecteds2_1[2] = {AS * AS, BS * AS};
-+int32_t expecteds2_2[2] = {AS * BS, BS * BS};
-+int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS};
-+int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS};
-+int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS};
-+int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS};
-+int32_t _elemsA = AS;
-+int32_t _elemsB = BS;
-+int32_t _elemsC = CS;
-+int32_t _elemsD = DS;
-+
-+#define AH ((int16_t) 0)
-+#define BH ((int16_t) -32)
-+#define CH ((int16_t) 102)
-+#define DH ((int16_t) -51)
-+#define EH ((int16_t) 71)
-+#define FH ((int16_t) -91)
-+#define GH ((int16_t) 48)
-+#define HH ((int16_t) 255)
-+int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH};
-+int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH};
-+int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH};
-+int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH};
-+int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH,
-+ EH * AH, FH * AH, GH * AH, HH * AH};
-+int16_t expectedh8_2[8] = {AH * BH, BH * BH, CH * BH, DH * BH,
-+ EH * BH, FH * BH, GH * BH, HH * BH};
-+int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH,
-+ EH * CH, FH * CH, GH * CH, HH * CH};
-+int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH,
-+ EH * DH, FH * DH, GH * DH, HH * DH};
-+int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH,
-+ EH * EH, FH * EH, GH * EH, HH * EH};
-+int16_t expectedh8_6[8] = {AH * FH, BH * FH, CH * FH, DH * FH,
-+ EH * FH, FH * FH, GH * FH, HH * FH};
-+int16_t expectedh8_7[8] = {AH * GH, BH * GH, CH * GH, DH * GH,
-+ EH * GH, FH * GH, GH * GH, HH * GH};
-+int16_t expectedh8_8[8] = {AH * HH, BH * HH, CH * HH, DH * HH,
-+ EH * HH, FH * HH, GH * HH, HH * HH};
-+int16_t _elemhA = AH;
-+int16_t _elemhB = BH;
-+int16_t _elemhC = CH;
-+int16_t _elemhD = DH;
-+int16_t _elemhE = EH;
-+int16_t _elemhF = FH;
-+int16_t _elemhG = GH;
-+int16_t _elemhH = HH;
-+
-+#define AUS (1024)
-+#define BUS (31)
-+#define CUS (0)
-+#define DUS (655)
-+uint32_t expectedus2_1[2] = {AUS * AUS, BUS * AUS};
-+uint32_t expectedus2_2[2] = {AUS * BUS, BUS * BUS};
-+uint32_t expectedus4_1[4] = {AUS * AUS, BUS * AUS, CUS * AUS, DUS * AUS};
-+uint32_t expectedus4_2[4] = {AUS * BUS, BUS * BUS, CUS * BUS, DUS * BUS};
-+uint32_t expectedus4_3[4] = {AUS * CUS, BUS * CUS, CUS * CUS, DUS * CUS};
-+uint32_t expectedus4_4[4] = {AUS * DUS, BUS * DUS, CUS * DUS, DUS * DUS};
-+uint32_t _elemusA = AUS;
-+uint32_t _elemusB = BUS;
-+uint32_t _elemusC = CUS;
-+uint32_t _elemusD = DUS;
-+
-+#define AUH ((uint16_t) 0)
-+#define BUH ((uint16_t) 32)
-+#define CUH ((uint16_t) 102)
-+#define DUH ((uint16_t) 51)
-+#define EUH ((uint16_t) 71)
-+#define FUH ((uint16_t) 91)
-+#define GUH ((uint16_t) 48)
-+#define HUH ((uint16_t) 255)
-+uint16_t expecteduh4_1[4] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH};
-+uint16_t expecteduh4_2[4] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH};
-+uint16_t expecteduh4_3[4] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH};
-+uint16_t expecteduh4_4[4] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH};
-+uint16_t expecteduh8_1[8] = {AUH * AUH, BUH * AUH, CUH * AUH, DUH * AUH,
-+ EUH * AUH, FUH * AUH, GUH * AUH, HUH * AUH};
-+uint16_t expecteduh8_2[8] = {AUH * BUH, BUH * BUH, CUH * BUH, DUH * BUH,
-+ EUH * BUH, FUH * BUH, GUH * BUH, HUH * BUH};
-+uint16_t expecteduh8_3[8] = {AUH * CUH, BUH * CUH, CUH * CUH, DUH * CUH,
-+ EUH * CUH, FUH * CUH, GUH * CUH, HUH * CUH};
-+uint16_t expecteduh8_4[8] = {AUH * DUH, BUH * DUH, CUH * DUH, DUH * DUH,
-+ EUH * DUH, FUH * DUH, GUH * DUH, HUH * DUH};
-+uint16_t expecteduh8_5[8] = {AUH * EUH, BUH * EUH, CUH * EUH, DUH * EUH,
-+ EUH * EUH, FUH * EUH, GUH * EUH, HUH * EUH};
-+uint16_t expecteduh8_6[8] = {AUH * FUH, BUH * FUH, CUH * FUH, DUH * FUH,
-+ EUH * FUH, FUH * FUH, GUH * FUH, HUH * FUH};
-+uint16_t expecteduh8_7[8] = {AUH * GUH, BUH * GUH, CUH * GUH, DUH * GUH,
-+ EUH * GUH, FUH * GUH, GUH * GUH, HUH * GUH};
-+uint16_t expecteduh8_8[8] = {AUH * HUH, BUH * HUH, CUH * HUH, DUH * HUH,
-+ EUH * HUH, FUH * HUH, GUH * HUH, HUH * HUH};
-+uint16_t _elemuhA = AUH;
-+uint16_t _elemuhB = BUH;
-+uint16_t _elemuhC = CUH;
-+uint16_t _elemuhD = DUH;
-+uint16_t _elemuhE = EUH;
-+uint16_t _elemuhF = FUH;
-+uint16_t _elemuhG = GUH;
-+uint16_t _elemuhH = HUH;
-+
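-+/* Each check_* function below multiplies a fixed input vector by a
-+   scalar using the corresponding vmul{q}_n_* intrinsic, stores the
-+   result and compares every lane against the precomputed expected
-+   values, aborting on a mismatch.  */
-+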
-+void
-+check_v2sf (float32_t elemA, float32_t elemB)
-+{
-+ int32_t indx;
-+ const float32_t vec32x2_buf[2] = {A, B};
-+ float32x2_t vec32x2_src = vld1_f32 (vec32x2_buf);
-+ float32_t vec32x2_res[2];
-+
-+ vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemA));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_1[indx])
-+ abort ();
-+
-+ vst1_f32 (vec32x2_res, vmul_n_f32 (vec32x2_src, elemB));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (* (uint32_t *) &vec32x2_res[indx] != * (uint32_t *) &expected2_2[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 2 } } */
-+}
-+
-+void
-+check_v4sf (float32_t elemA, float32_t elemB, float32_t elemC, float32_t elemD)
-+{
-+ int32_t indx;
-+ const float32_t vec32x4_buf[4] = {A, B, C, D};
-+ float32x4_t vec32x4_src = vld1q_f32 (vec32x4_buf);
-+ float32_t vec32x4_res[4];
-+
-+ vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemA));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_1[indx])
-+ abort ();
-+
-+ vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemB));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_2[indx])
-+ abort ();
-+
-+ vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemC));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_3[indx])
-+ abort ();
-+
-+ vst1q_f32 (vec32x4_res, vmulq_n_f32 (vec32x4_src, elemD));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (* (uint32_t *) &vec32x4_res[indx] != * (uint32_t *) &expected4_4[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */
-+}
-+
-+void
-+check_v2df (float64_t elemdC, float64_t elemdD)
-+{
-+ int32_t indx;
-+ const float64_t vec64x2_buf[2] = {AD, BD};
-+ float64x2_t vec64x2_src = vld1q_f64 (vec64x2_buf);
-+ float64_t vec64x2_res[2];
-+
-+ vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdC));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_1[indx])
-+ abort ();
-+
-+ vst1q_f64 (vec64x2_res, vmulq_n_f64 (vec64x2_src, elemdD));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (* (uint64_t *) &vec64x2_res[indx] != * (uint64_t *) &expectedd2_2[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "fmul\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[0\\\]" 2 } } */
-+}
-+
-+void
-+check_v2si (int32_t elemsA, int32_t elemsB)
-+{
-+ int32_t indx;
-+ const int32_t vecs32x2_buf[2] = {AS, BS};
-+ int32x2_t vecs32x2_src = vld1_s32 (vecs32x2_buf);
-+ int32_t vecs32x2_res[2];
-+
-+ vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsA));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (vecs32x2_res[indx] != expecteds2_1[indx])
-+ abort ();
-+
-+ vst1_s32 (vecs32x2_res, vmul_n_s32 (vecs32x2_src, elemsB));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (vecs32x2_res[indx] != expecteds2_2[indx])
-+ abort ();
-+}
-+
-+void
-+check_v2si_unsigned (uint32_t elemusA, uint32_t elemusB)
-+{
-+ int indx;
-+ const uint32_t vecus32x2_buf[2] = {AUS, BUS};
-+ uint32x2_t vecus32x2_src = vld1_u32 (vecus32x2_buf);
-+ uint32_t vecus32x2_res[2];
-+
-+ vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusA));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (vecus32x2_res[indx] != expectedus2_1[indx])
-+ abort ();
-+
-+ vst1_u32 (vecus32x2_res, vmul_n_u32 (vecus32x2_src, elemusB));
-+
-+ for (indx = 0; indx < 2; indx++)
-+ if (vecus32x2_res[indx] != expectedus2_2[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[0\\\]" 4 } } */
-+}
-+
-+void
-+check_v4si (int32_t elemsA, int32_t elemsB, int32_t elemsC, int32_t elemsD)
-+{
-+ int32_t indx;
-+ const int32_t vecs32x4_buf[4] = {AS, BS, CS, DS};
-+ int32x4_t vecs32x4_src = vld1q_s32 (vecs32x4_buf);
-+ int32_t vecs32x4_res[4];
-+
-+ vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsA));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecs32x4_res[indx] != expecteds4_1[indx])
-+ abort ();
-+
-+ vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsB));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecs32x4_res[indx] != expecteds4_2[indx])
-+ abort ();
-+
-+ vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsC));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecs32x4_res[indx] != expecteds4_3[indx])
-+ abort ();
-+
-+ vst1q_s32 (vecs32x4_res, vmulq_n_s32 (vecs32x4_src, elemsD));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecs32x4_res[indx] != expecteds4_4[indx])
-+ abort ();
-+}
-+
-+void
-+check_v4si_unsigned (uint32_t elemusA, uint32_t elemusB, uint32_t elemusC,
-+ uint32_t elemusD)
-+{
-+ int indx;
-+ const uint32_t vecus32x4_buf[4] = {AUS, BUS, CUS, DUS};
-+ uint32x4_t vecus32x4_src = vld1q_u32 (vecus32x4_buf);
-+ uint32_t vecus32x4_res[4];
-+
-+ vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusA));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecus32x4_res[indx] != expectedus4_1[indx])
-+ abort ();
-+
-+ vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusB));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecus32x4_res[indx] != expectedus4_2[indx])
-+ abort ();
-+
-+ vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusC));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecus32x4_res[indx] != expectedus4_3[indx])
-+ abort ();
-+
-+ vst1q_u32 (vecus32x4_res, vmulq_n_u32 (vecus32x4_src, elemusD));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecus32x4_res[indx] != expectedus4_4[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "\tmul\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[0\\\]" 8 } } */
-+}
-+
-+
-+void
-+check_v4hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD)
-+{
-+ int32_t indx;
-+ const int16_t vech16x4_buf[4] = {AH, BH, CH, DH};
-+ int16x4_t vech16x4_src = vld1_s16 (vech16x4_buf);
-+ int16_t vech16x4_res[4];
-+
-+ vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhA));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vech16x4_res[indx] != expectedh4_1[indx])
-+ abort ();
-+
-+ vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhB));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vech16x4_res[indx] != expectedh4_2[indx])
-+ abort ();
-+
-+ vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhC));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vech16x4_res[indx] != expectedh4_3[indx])
-+ abort ();
-+
-+ vst1_s16 (vech16x4_res, vmul_n_s16 (vech16x4_src, elemhD));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vech16x4_res[indx] != expectedh4_4[indx])
-+ abort ();
-+}
-+
-+void
-+check_v4hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
-+ uint16_t elemuhD)
-+{
-+ int indx;
-+ const uint16_t vecuh16x4_buf[4] = {AUH, BUH, CUH, DUH};
-+ uint16x4_t vecuh16x4_src = vld1_u16 (vecuh16x4_buf);
-+ uint16_t vecuh16x4_res[4];
-+
-+ vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhA));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecuh16x4_res[indx] != expecteduh4_1[indx])
-+ abort ();
-+
-+ vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhB));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecuh16x4_res[indx] != expecteduh4_2[indx])
-+ abort ();
-+
-+ vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhC));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecuh16x4_res[indx] != expecteduh4_3[indx])
-+ abort ();
-+
-+ vst1_u16 (vecuh16x4_res, vmul_n_u16 (vecuh16x4_src, elemuhD));
-+
-+ for (indx = 0; indx < 4; indx++)
-+ if (vecuh16x4_res[indx] != expecteduh4_4[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.4h, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[0\\\]" 8 } } */
-+}
-+
-+void
-+check_v8hi (int16_t elemhA, int16_t elemhB, int16_t elemhC, int16_t elemhD,
-+ int16_t elemhE, int16_t elemhF, int16_t elemhG, int16_t elemhH)
-+{
-+ int32_t indx;
-+ const int16_t vech16x8_buf[8] = {AH, BH, CH, DH, EH, FH, GH, HH};
-+ int16x8_t vech16x8_src = vld1q_s16 (vech16x8_buf);
-+ int16_t vech16x8_res[8];
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhA));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_1[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhB));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_2[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhC));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_3[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhD));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_4[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhE));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_5[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhF));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_6[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhG));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_7[indx])
-+ abort ();
-+
-+ vst1q_s16 (vech16x8_res, vmulq_n_s16 (vech16x8_src, elemhH));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vech16x8_res[indx] != expectedh8_8[indx])
-+ abort ();
-+}
-+
-+void
-+check_v8hi_unsigned (uint16_t elemuhA, uint16_t elemuhB, uint16_t elemuhC,
-+ uint16_t elemuhD, uint16_t elemuhE, uint16_t elemuhF,
-+ uint16_t elemuhG, uint16_t elemuhH)
-+{
-+ int indx;
-+ const uint16_t vecuh16x8_buf[8] = {AUH, BUH, CUH, DUH, EUH, FUH, GUH, HUH};
-+ uint16x8_t vecuh16x8_src = vld1q_u16 (vecuh16x8_buf);
-+ uint16_t vecuh16x8_res[8];
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhA));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_1[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhB));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_2[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhC));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_3[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhD));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_4[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhE));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_5[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhF));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_6[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhG));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_7[indx])
-+ abort ();
-+
-+ vst1q_u16 (vecuh16x8_res, vmulq_n_u16 (vecuh16x8_src, elemuhH));
-+
-+ for (indx = 0; indx < 8; indx++)
-+ if (vecuh16x8_res[indx] != expecteduh8_8[indx])
-+ abort ();
-+
-+/* { dg-final { scan-assembler-times "mul\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[0\\\]" 16 } } */
-+}
-+
-+int
-+main (void)
-+{
-+ check_v2sf (_elemA, _elemB);
-+ check_v4sf (_elemA, _elemB, _elemC, _elemD);
-+ check_v2df (_elemdC, _elemdD);
-+ check_v2si (_elemsA, _elemsB);
-+ check_v4si (_elemsA, _elemsB, _elemsC, _elemsD);
-+ check_v4hi (_elemhA, _elemhB, _elemhC, _elemhD);
-+ check_v8hi (_elemhA, _elemhB, _elemhC, _elemhD,
-+ _elemhE, _elemhF, _elemhG, _elemhH);
-+ check_v2si_unsigned (_elemusA, _elemusB);
-+ check_v4si_unsigned (_elemusA, _elemusB, _elemusC, _elemusD);
-+ check_v4hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD);
-+ check_v8hi_unsigned (_elemuhA, _elemuhB, _elemuhC, _elemuhD,
-+ _elemuhE, _elemuhF, _elemuhG, _elemuhH);
-+
-+ return 0;
-+}
-+
---- a/src/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/store-pair-1.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2" } */
-+/* { dg-options "-O2 -mcpu=generic" } */
-
- int f(int *a, int b)
- {
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c
-@@ -0,0 +1,14 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -mtune=generic" } */
-+
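-+/* Check that a constant whose two 32-bit halves are identical is
-+   synthesized once in a W-reg and stored as a pair with STP.  */
-+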
-+void
-+foo (unsigned long long *a)
-+{
-+ a[0] = 0x0140c0da0140c0daULL;
-+}
-+
-+/* { dg-final { scan-assembler-times "movk\\tw.*" 1 } } */
-+/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c
-@@ -0,0 +1,15 @@
-+/* { dg-do compile } */
-+/* { dg-options "-Os" } */
-+
-+/* Check that for -Os we synthesize only the bottom half and then
-+ store it twice with an STP rather than synthesizing it twice in each
-+ half of an X-reg. */
-+
-+void
-+foo (unsigned long long *a)
-+{
-+ a[0] = 0xc0da0000c0daULL;
-+}
-+
-+/* { dg-final { scan-assembler-times "mov\\tw.*" 1 } } */
-+/* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/struct_return.c
-@@ -0,0 +1,31 @@
-+/* Test the absence of a spurious move from x8 to x0 for functions
-+   returning structures.  */
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
-+struct s
-+{
-+ long x;
-+ long y;
-+ long z;
-+};
-+
-+struct s __attribute__((noinline))
-+foo (long a, long d, long c)
-+{
-+ struct s b;
-+ b.x = a;
-+ b.y = d;
-+ b.z = c;
-+ return b;
-+}
-+
-+int
-+main (void)
-+{
-+ struct s x;
-+ x = foo ( 10, 20, 30);
-+ return x.x + x.y + x.z;
-+}
-+
-+/* { dg-final { scan-assembler-not "mov\tx0, x8" } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
-@@ -4,8 +4,7 @@
- * total frame size > 512.
- area except outgoing <= 512
- * number of callee-saved reg >= 2.
-- * Split stack adjustment into two subtractions.
-- the first subtractions could be optimized into "stp !". */
-+ * Use a single stack adjustment, no writeback. */
-
- /* { dg-do run } */
- /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
-@@ -15,6 +14,6 @@
- t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10])
- t_frame_run (test10)
-
--/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
--/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 } } */
-+/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */
-+/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */
-
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
-@@ -13,6 +13,6 @@ t_frame_run (test12)
-
- /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
-
--/* Check epilogue using write-back. */
--/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 } } */
-+/* Check epilogue using no write-back. */
-+/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */
-
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
-@@ -2,8 +2,7 @@
- * without outgoing.
- * total frame size > 512.
- * number of callee-save reg >= 2.
-- * split the stack adjustment into two substractions,
-- the second could be optimized into "stp !". */
-+ * Use a single stack adjustment, no writeback. */
-
- /* { dg-do run } */
- /* { dg-options "-O2 --save-temps" } */
-@@ -14,4 +13,4 @@ t_frame_pattern (test13, 700, )
- t_frame_run (test13)
-
- /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
--/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
-+/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp\\\]" 1 } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
-@@ -3,8 +3,7 @@
- * total frame size > 512.
- area except outgoing <= 512
- * number of callee-save reg >= 2.
-- * split the stack adjustment into two substractions,
-- the first could be optimized into "stp !". */
-+ * Use a single stack adjustment, no writeback. */
-
- /* { dg-do run } */
- /* { dg-options "-O2 --save-temps" } */
-@@ -15,4 +14,4 @@ t_frame_pattern_outgoing (test15, 480, , 8, a[8])
- t_frame_run (test15)
-
- /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
--/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */
-+/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
-@@ -0,0 +1,25 @@
-+/* Verify:
-+ * with outgoing.
-+ * single int register push.
-+ * varargs and callee-save size >= 256
-+ * Use 2 stack adjustments. */
-+
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
-+
-+#define REP8(X) X,X,X,X,X,X,X,X
-+#define REP64(X) REP8(REP8(X))
-+
-+void outgoing (__builtin_va_list, ...);
-+
-+double vararg_outgoing (int x1, ...)
-+{
-+ double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = x1 * 6;
-+ __builtin_va_list vl;
-+ __builtin_va_start (vl, x1);
-+ outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
-+ __builtin_va_end (vl);
-+ return a1 + a2 + a3 + a4 + a5 + a6;
-+}
-+
-+/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_17.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 --save-temps" } */
-+
-+/* Test reuse of stack adjustment temporaries. */
-+
-+void foo ();
-+
-+int reuse_mov (int i)
-+{
-+ int arr[1025];
-+ return arr[i];
-+}
-+
-+int no_reuse_mov (int i)
-+{
-+ int arr[1025];
-+ foo ();
-+ return arr[i];
-+}
-+
-+/* { dg-final { scan-assembler-times "mov\tx16, \[0-9\]+" 3 } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
-@@ -3,8 +3,7 @@
- * without outgoing.
- * total frame size > 512.
- * number of callee-saved reg == 1.
-- * split stack adjustment into two subtractions.
-- the second subtraction should use "str !". */
-+ * use a single stack adjustment, no writeback. */
-
- /* { dg-do run } */
- /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
-@@ -14,6 +13,7 @@
- t_frame_pattern (test6, 700, )
- t_frame_run (test6)
-
--/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
--/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 1 } } */
-+/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]" 2 } } */
-+/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]," 1 } } */
-
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
-@@ -3,8 +3,7 @@
- * without outgoing.
- * total frame size > 512.
- * number of callee-saved reg == 2.
-- * split stack adjustment into two subtractions.
-- the second subtraction should use "stp !". */
-+ * use a single stack adjustment, no writeback. */
-
- /* { dg-do run } */
- /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
-@@ -14,6 +13,6 @@
- t_frame_pattern (test7, 700, "x19")
- t_frame_run (test7)
-
--/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
--/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 } } */
-+/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp]" 1 } } */
-+/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\]" 1 } } */
-
---- a/src/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
-@@ -12,6 +12,6 @@
- t_frame_pattern_outgoing (test8, 700, , 8, a[8])
- t_frame_run (test8)
-
--/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */
--/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 3 } } */
-+/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } */
-+/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } */
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/thunderxloadpair.c
-@@ -0,0 +1,20 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -mcpu=thunderx" } */
-+
-+struct ldp
-+{
-+ long long c;
-+ int a, b;
-+};
-+
-+
-+int f(struct ldp *a)
-+{
-+ return a->a + a->b;
-+}
-+
-+
-+/* We know the alignment of a->a to be 8 bytes, so it is profitable
-+   to use ldp.  */
-+/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/thunderxnoloadpair.c
-@@ -0,0 +1,17 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 -mcpu=thunderx" } */
-+
-+struct noldp
-+{
-+ int a, b;
-+};
-+
-+
-+int f(struct noldp *a)
-+{
-+ return a->a + a->b;
-+}
-+
-+/* We know the alignment of a->a to be only 4 bytes, so it is not
-+   profitable to use ldp.  */
-+/* { dg-final { scan-assembler-not "ldp\tw\[0-9\]+, w\[0-9\]" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/ubfiz_lsl_1.c
-@@ -0,0 +1,13 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
-+/* Check that an X-reg UBFIZ can be simplified into a W-reg LSL. */
-+
-+long long
-+f2 (long long x)
-+{
-+ return (x << 5) & 0xffffffff;
-+}
-+
-+/* { dg-final { scan-assembler "lsl\tw" } } */
-+/* { dg-final { scan-assembler-not "ubfiz\tx" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/ubfx_lsr_1.c
-@@ -0,0 +1,14 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
-+/* Check that an X-reg UBFX can be simplified into a W-reg LSR. */
-+
-+int
-+f (unsigned long long x)
-+{
-+ x = (x >> 24) & 255;
-+ return x + 1;
-+}
-+
-+/* { dg-final { scan-assembler "lsr\tw" } } */
-+/* { dg-final { scan-assembler-not "ubfx\tx" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_1.c
-@@ -0,0 +1,11 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 --save-temps" } */
-+
-+int
-+f (int a, ...)
-+{
-+ /* { dg-final { scan-assembler-not "str" } } */
-+ return a;
-+}
-+
-+/* { dg-final { cleanup-saved-temps } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_2.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 --save-temps" } */
-+
-+int
-+foo (char *fmt, ...)
-+{
-+ int d;
-+ __builtin_va_list ap;
-+
-+ __builtin_va_start (ap, fmt);
-+ d = __builtin_va_arg (ap, int);
-+ __builtin_va_end (ap);
-+
-+ /* { dg-final { scan-assembler-not "x7" } } */
-+ return d;
-+}
-+
-+/* { dg-final { cleanup-saved-temps } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/va_arg_3.c
-@@ -0,0 +1,26 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2 --save-temps" } */
-+
-+int d2i (double a);
-+
-+int
-+foo (char *fmt, ...)
-+{
-+ int d, e;
-+ double f, g;
-+ __builtin_va_list ap;
-+
-+ __builtin_va_start (ap, fmt);
-+ d = __builtin_va_arg (ap, int);
-+ f = __builtin_va_arg (ap, double);
-+ g = __builtin_va_arg (ap, double);
-+ d += d2i (f);
-+ d += d2i (g);
-+ __builtin_va_end (ap);
-+
-+ /* { dg-final { scan-assembler-not "x7" } } */
-+ /* { dg-final { scan-assembler-not "q7" } } */
-+ return d;
-+}
-+
-+/* { dg-final { cleanup-saved-temps } } */
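
These three va_arg tests pin down how far the AArch64 register save area can
shrink: va_start conventionally spills all of x0-x7 and q0-q7, but when the
compiler can see every va_arg read it only needs the registers actually
reachable - none in va_arg_1, nothing beyond the first variadic GP slot in
va_arg_2, and neither x7 nor q7 in va_arg_3. By contrast, once ap escapes the
full spill must stay; a sketch of that case (vprintf is the standard API, the
rest is hypothetical):

#include <stdarg.h>
#include <stdio.h>

/* ap is passed on, so any argument register may later be read and
   the whole x0-x7/q0-q7 save area must be populated.  */
int
log_all (const char *fmt, ...)
{
  va_list ap;
  va_start (ap, fmt);
  int n = vprintf (fmt, ap);
  va_end (ap);
  return n;
}
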
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-abs-compile.c
-@@ -1,6 +1,6 @@
-
- /* { dg-do compile } */
--/* { dg-options "-O3" } */
-+/* { dg-options "-O3 -fno-vect-cost-model" } */
-
- #define N 16
-
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-clz.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-clz.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-O3 -save-temps -fno-inline" } */
-+/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */
-
- extern void abort ();
-
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-eq-d.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
-
- #define FTYPE double
- #define ITYPE long
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-ge-d.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
-
- #define FTYPE double
- #define ITYPE long
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fcm-gt-d.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-unroll-loops --save-temps -fno-inline -fno-vect-cost-model" } */
-
- #define FTYPE double
- #define ITYPE long
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd-zero.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
-
- #define N 32
-
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovd.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
-
- #define N 32
-
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf-zero.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf-zero.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
-
- #define N 32
-
---- a/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect-fmovf.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -fno-vect-cost-model" } */
-
- #define N 32
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
-@@ -0,0 +1,86 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O3" } */
-+
-+#include "arm_neon.h"
-+
-+#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2) \
-+TYPE1 __attribute__((noinline,noclone)) \
-+test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \
-+{ \
-+ return vcopy##Q1##_lane##Q2##_##SUFFIX (a, INDEX1, b, INDEX2); \
-+}
-+
-+/* vcopy_lane. */
-+BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6)
-+BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6)
-+BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */
-+BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2)
-+BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2)
-+BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */
-+BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
-+BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0)
-+BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */
-+BUILD_TEST (int64x1_t, int64x1_t, , , s64, 0, 0)
-+BUILD_TEST (uint64x1_t, uint64x1_t, , , u64, 0, 0)
-+BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0)
-+/* { dg-final { scan-assembler-times "fmov\\td0, d1" 3 } } */
-+
-+/* vcopy_laneq. */
-+
-+BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15)
-+BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15)
-+BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */
-+BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7)
-+BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7)
-+BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */
-+BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3)
-+BUILD_TEST (int32x2_t, int32x4_t, , q, s32, 1, 3)
-+BUILD_TEST (uint32x2_t, uint32x4_t, , q, u32, 1, 3)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */
-+BUILD_TEST (float64x1_t, float64x2_t, , q, f64, 0, 1)
-+BUILD_TEST (int64x1_t, int64x2_t, , q, s64, 0, 1)
-+BUILD_TEST (uint64x1_t, uint64x2_t, , q, u64, 0, 1)
-+/* XFAIL due to PR 71307. */
-+/* { dg-final { scan-assembler-times "dup\\td0, v1.d\\\[1\\\]" 3 { xfail *-*-* } } } */
-+
-+/* vcopyq_lane. */
-+BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7)
-+BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7)
-+BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */
-+BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3)
-+BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3)
-+BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */
-+BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1)
-+BUILD_TEST (int32x4_t, int32x2_t, q, , s32, 3, 1)
-+BUILD_TEST (uint32x4_t, uint32x2_t, q, , u32, 3, 1)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */
-+BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0)
-+BUILD_TEST (int64x2_t, int64x1_t, q, , s64, 1, 0)
-+BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0)
-+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 3 } } */
-+
-+/* vcopyq_laneq. */
-+
-+BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15)
-+BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15)
-+BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */
-+BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7)
-+BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7)
-+BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */
-+BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3)
-+BUILD_TEST (int32x4_t, int32x4_t, q, q, s32, 2, 3)
-+BUILD_TEST (uint32x4_t, uint32x4_t, q, q, u32, 2, 3)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */
-+BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1)
-+BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1)
-+BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1)
-+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */
---- a/src/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
-@@ -1,5 +1,5 @@
- /* { dg-do run } */
--/* { dg-options "-O3 -save-temps -fno-inline" } */
-+/* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */
-
- extern void abort ();
-
---- a/src/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
-@@ -38,14 +38,14 @@ f11 (void)
- return sum;
- }
-
--char arr_c[100][100];
-+char arr_c[100];
- char
- f12 (void)
- {
- int i;
- char sum = 0;
- for (i = 0; i < 100; i++)
-- sum += arr_c[i][0] * arr_c[0][i];
-+ sum += arr_c[i] * arr_c[i];
- return sum;
- }
-
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c
-@@ -0,0 +1,72 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
-+#include "arm_neon.h"
-+
-+#define BUILD_TEST(TYPE1, TYPE2, Q1, Q2, SUFFIX, INDEX1, INDEX2) \
-+TYPE1 __attribute__((noinline,noclone)) \
-+test_copy##Q1##_lane##Q2##_##SUFFIX (TYPE1 a, TYPE2 b) \
-+{ \
-+ return vset##Q1##_lane_##SUFFIX (vget##Q2##_lane_##SUFFIX (b, INDEX2),\
-+ a, INDEX1); \
-+}
-+
-+BUILD_TEST (poly8x8_t, poly8x8_t, , , p8, 7, 6)
-+BUILD_TEST (int8x8_t, int8x8_t, , , s8, 7, 6)
-+BUILD_TEST (uint8x8_t, uint8x8_t, , , u8, 7, 6)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[6\\\]" 3 } } */
-+BUILD_TEST (poly16x4_t, poly16x4_t, , , p16, 3, 2)
-+BUILD_TEST (int16x4_t, int16x4_t, , , s16, 3, 2)
-+BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[2\\\]" 3 } } */
-+BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
-+BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0)
-+BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */
-+
-+BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15)
-+BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15)
-+BUILD_TEST (uint8x8_t, uint8x16_t, , q, u8, 7, 15)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[7\\\], v1.b\\\[15\\\]" 3 } } */
-+BUILD_TEST (poly16x4_t, poly16x8_t, , q, p16, 3, 7)
-+BUILD_TEST (int16x4_t, int16x8_t, , q, s16, 3, 7)
-+BUILD_TEST (uint16x4_t, uint16x8_t, , q, u16, 3, 7)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[3\\\], v1.h\\\[7\\\]" 3 } } */
-+BUILD_TEST (float32x2_t, float32x4_t, , q, f32, 1, 3)
-+BUILD_TEST (int32x2_t, int32x4_t, , q, s32, 1, 3)
-+BUILD_TEST (uint32x2_t, uint32x4_t, , q, u32, 1, 3)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[3\\\]" 3 } } */
-+
-+BUILD_TEST (poly8x16_t, poly8x8_t, q, , p8, 15, 7)
-+BUILD_TEST (int8x16_t, int8x8_t, q, , s8, 15, 7)
-+BUILD_TEST (uint8x16_t, uint8x8_t, q, , u8, 15, 7)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[15\\\], v1.b\\\[7\\\]" 3 } } */
-+BUILD_TEST (poly16x8_t, poly16x4_t, q, , p16, 7, 3)
-+BUILD_TEST (int16x8_t, int16x4_t, q, , s16, 7, 3)
-+BUILD_TEST (uint16x8_t, uint16x4_t, q, , u16, 7, 3)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[7\\\], v1.h\\\[3\\\]" 3 } } */
-+BUILD_TEST (float32x4_t, float32x2_t, q, , f32, 3, 1)
-+BUILD_TEST (int32x4_t, int32x2_t, q, , s32, 3, 1)
-+BUILD_TEST (uint32x4_t, uint32x2_t, q, , u32, 3, 1)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[3\\\], v1.s\\\[1\\\]" 3 } } */
-+BUILD_TEST (float64x2_t, float64x1_t, q, , f64, 1, 0)
-+BUILD_TEST (int64x2_t, int64x1_t, q, , s64, 1, 0)
-+BUILD_TEST (uint64x2_t, uint64x1_t, q, , u64, 1, 0)
-+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[0\\\]" 3 } } */
-+
-+BUILD_TEST (poly8x16_t, poly8x16_t, q, q, p8, 14, 15)
-+BUILD_TEST (int8x16_t, int8x16_t, q, q, s8, 14, 15)
-+BUILD_TEST (uint8x16_t, uint8x16_t, q, q, u8, 14, 15)
-+/* { dg-final { scan-assembler-times "ins\\tv0.b\\\[14\\\], v1.b\\\[15\\\]" 3 } } */
-+BUILD_TEST (poly16x8_t, poly16x8_t, q, q, p16, 6, 7)
-+BUILD_TEST (int16x8_t, int16x8_t, q, q, s16, 6, 7)
-+BUILD_TEST (uint16x8_t, uint16x8_t, q, q, u16, 6, 7)
-+/* { dg-final { scan-assembler-times "ins\\tv0.h\\\[6\\\], v1.h\\\[7\\\]" 3 } } */
-+BUILD_TEST (float32x4_t, float32x4_t, q, q, f32, 2, 3)
-+BUILD_TEST (int32x4_t, int32x4_t, q, q, s32, 2, 3)
-+BUILD_TEST (uint32x4_t, uint32x4_t, q, q, u32, 2, 3)
-+/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[2\\\], v1.s\\\[3\\\]" 3 } } */
-+BUILD_TEST (float64x2_t, float64x2_t, q, q, f64, 1, 1)
-+BUILD_TEST (int64x2_t, int64x2_t, q, q, s64, 1, 1)
-+BUILD_TEST (uint64x2_t, uint64x2_t, q, q, u64, 1, 1)
-+/* { dg-final { scan-assembler-times "ins\\tv0.d\\\[1\\\], v1.d\\\[1\\\]" 3 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/aarch64/vminmaxnm.c
-@@ -0,0 +1,37 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+
-+#include "arm_neon.h"
-+
-+/* For each of these intrinsics, we map directly to an unspec in RTL.
-+ We're just using the argument directly and returning the result, so we
-+ can precisely specify the exact instruction pattern and register
-+ allocations we expect. */
-+
-+float64x1_t
-+test_vmaxnm_f64 (float64x1_t a, float64x1_t b)
-+{
-+ /* { dg-final { scan-assembler-times "fmaxnm\td0, d0, d1" 1 } } */
-+ return vmaxnm_f64 (a, b);
-+}
-+
-+float64x1_t
-+test_vminnm_f64 (float64x1_t a, float64x1_t b)
-+{
-+ /* { dg-final { scan-assembler-times "fminnm\td0, d0, d1" 1 } } */
-+ return vminnm_f64 (a, b);
-+}
-+
-+float64x1_t
-+test_vmax_f64 (float64x1_t a, float64x1_t b)
-+{
-+ /* { dg-final { scan-assembler-times "fmax\td0, d0, d1" 1 } } */
-+ return vmax_f64 (a, b);
-+}
-+
-+float64x1_t
-+test_vmin_f64 (float64x1_t a, float64x1_t b)
-+{
-+ /* { dg-final { scan-assembler-times "fmin\td0, d0, d1" 1 } } */
-+ return vmin_f64 (a, b);
-+}
-\ No newline at end of file
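
What separates the two pairs of intrinsics is NaN handling: fmaxnm/fminnm
implement the IEEE 754-2008 maxNum/minNum operations, which return the numeric
operand when exactly one input is a quiet NaN, while fmax/fmin propagate the
NaN. A small illustration (hypothetical driver, not part of the test):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main (void)
{
  float64x1_t num = vdup_n_f64 (1.0);
  float64x1_t nan = vdup_n_f64 (NAN);

  /* maxNum treats the quiet NaN as missing data: prints 1.0.  */
  printf ("%f\n", vget_lane_f64 (vmaxnm_f64 (num, nan), 0));
  /* fmax propagates the NaN: prints nan.  */
  printf ("%f\n", vget_lane_f64 (vmax_f64 (num, nan), 0));
  return 0;
}
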
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/neon-vect10.c
-@@ -0,0 +1,32 @@
-+/* Test AAPCS layout (VFP variant for Neon types) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_neon_fp16_hw } */
-+/* { dg-add-options arm_neon_fp16 } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define NEON
-+#define TESTFILE "neon-vect10.c"
-+#include "neon-constants.h"
-+
-+#include "abitest.h"
-+#else
-+
-+ARG (int32x4_t, i32x4_constvec2, Q0) /* D0, D1. */
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 3.0f, S4 + 2) /* D2, Q1. */
-+#else
-+ARG (__fp16, 3.0f, S4) /* D2, Q1. */
-+#endif
-+ARG (int32x4x2_t, i32x4x2_constvec1, Q2) /* Q2, Q3 - D4-D7, S8-S15. */
-+ARG (double, 12.0, D3) /* Backfill this particular argument. */
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 5.0f, S5 + 2) /* Backfill in S5. */
-+#else
-+ARG (__fp16, 5.0f, S5) /* Backfill in S5. */
-+#endif
-+ARG (int32x4x2_t, i32x4x2_constvec2, STACK)
-+LAST_ARG (int, 3, R0)
-+#endif
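
The S4 + 2 forms in the big-endian branches encode where a 2-byte __fp16 sits
inside a 4-byte single-precision slot: the low-addressed half on little-endian,
the high-addressed half (byte offset 2) on big-endian, which is the offset the
abitest framework checks. The same pattern recurs in neon-vect9 and the
vfp18-vfp25 tests below.
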
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/neon-vect9.c
-@@ -0,0 +1,24 @@
-+/* Test AAPCS layout (VFP variant for Neon types) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_neon_fp16_hw } */
-+/* { dg-add-options arm_neon_fp16 } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define NEON
-+#define TESTFILE "neon-vect9.c"
-+#include "neon-constants.h"
-+
-+#include "abitest.h"
-+#else
-+
-+ARG (int32x4_t, i32x4_constvec2, Q0) /* D0, D1. */
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 3.0f, S4 + 2) /* D2, Q1 occupied. */
-+#else
-+ARG (__fp16, 3.0f, S4) /* D2, Q1 occupied. */
-+#endif
-+LAST_ARG (int, 3, R0)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp18.c
-@@ -0,0 +1,28 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_ieee } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp18.c"
-+#include "abitest.h"
-+
-+#else
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, S0 + 2)
-+#else
-+ARG (__fp16, 1.0f, S0)
-+#endif
-+ARG (float, 2.0f, S1)
-+ARG (double, 4.0, D1)
-+ARG (float, 2.0f, S4)
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, S5 + 2)
-+#else
-+ARG (__fp16, 1.0f, S5)
-+#endif
-+LAST_ARG (int, 3, R0)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp19.c
-@@ -0,0 +1,30 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_ieee } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp19.c"
-+
-+__complex__ x = 1.0+2.0i;
-+
-+#include "abitest.h"
-+#else
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, S0 + 2)
-+#else
-+ARG (__fp16, 1.0f, S0)
-+#endif
-+ARG (float, 2.0f, S1)
-+ARG (__complex__ double, x, D1)
-+ARG (float, 3.0f, S6)
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 2.0f, S7 + 2)
-+#else
-+ARG (__fp16, 2.0f, S7)
-+#endif
-+LAST_ARG (int, 3, R0)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp20.c
-@@ -0,0 +1,22 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_ieee } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp20.c"
-+
-+#define PCSATTR __attribute__((pcs("aapcs")))
-+
-+#include "abitest.h"
-+#else
-+ARG (float, 1.0f, R0)
-+ARG (double, 2.0, R2)
-+ARG (float, 3.0f, STACK)
-+ARG (__fp16, 2.0f, STACK+4)
-+LAST_ARG (double, 4.0, STACK+8)
-+#endif
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp21.c
-@@ -0,0 +1,26 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_ieee } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp21.c"
-+
-+#define PCSATTR __attribute__((pcs("aapcs")))
-+
-+#include "abitest.h"
-+#else
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, R0 + 2)
-+#else
-+ARG (__fp16, 1.0f, R0)
-+#endif
-+ARG (double, 2.0, R2)
-+ARG (__fp16, 3.0f, STACK)
-+ARG (float, 2.0f, STACK+4)
-+LAST_ARG (double, 4.0, STACK+8)
-+#endif
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp22.c
-@@ -0,0 +1,28 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_alternative } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp22.c"
-+#include "abitest.h"
-+
-+#else
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, S0 + 2)
-+#else
-+ARG (__fp16, 1.0f, S0)
-+#endif
-+ARG (float, 2.0f, S1)
-+ARG (double, 4.0, D1)
-+ARG (float, 2.0f, S4)
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, S5 + 2)
-+#else
-+ARG (__fp16, 1.0f, S5)
-+#endif
-+LAST_ARG (int, 3, R0)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp23.c
-@@ -0,0 +1,30 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_alternative } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp23.c"
-+
-+__complex__ x = 1.0+2.0i;
-+
-+#include "abitest.h"
-+#else
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, S0 + 2)
-+#else
-+ARG (__fp16, 1.0f, S0)
-+#endif
-+ARG (float, 2.0f, S1)
-+ARG (__complex__ double, x, D1)
-+ARG (float, 3.0f, S6)
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 2.0f, S7 + 2)
-+#else
-+ARG (__fp16, 2.0f, S7)
-+#endif
-+LAST_ARG (int, 3, R0)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp24.c
-@@ -0,0 +1,21 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_alternative } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp24.c"
-+
-+#define PCSATTR __attribute__((pcs("aapcs")))
-+
-+#include "abitest.h"
-+#else
-+ARG (float, 1.0f, R0)
-+ARG (double, 2.0, R2)
-+ARG (float, 3.0f, STACK)
-+ARG (__fp16, 2.0f, STACK+4)
-+LAST_ARG (double, 4.0, STACK+8)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/aapcs/vfp25.c
-@@ -0,0 +1,25 @@
-+/* Test AAPCS layout (VFP variant) */
-+
-+/* { dg-do run { target arm_eabi } } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_hw } */
-+/* { dg-add-options arm_fp16_alternative } */
-+
-+#ifndef IN_FRAMEWORK
-+#define VFP
-+#define TESTFILE "vfp25.c"
-+
-+#define PCSATTR __attribute__((pcs("aapcs")))
-+
-+#include "abitest.h"
-+#else
-+#if defined (__ARM_BIG_ENDIAN)
-+ARG (__fp16, 1.0f, R0 + 2)
-+#else
-+ARG (__fp16, 1.0f, R0)
-+#endif
-+ARG (double, 2.0, R2)
-+ARG (__fp16, 3.0f, STACK)
-+ARG (float, 2.0f, STACK+4)
-+LAST_ARG (double, 4.0, STACK+8)
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv5_thumb_isa.c
-@@ -0,0 +1,8 @@
-+/* { dg-require-effective-target arm_arch_v5_ok } */
-+/* { dg-add-options arm_arch_v5 } */
-+
-+#if __ARM_ARCH_ISA_THUMB
-+#error "__ARM_ARCH_ISA_THUMB defined for ARMv5"
-+#endif
-+
-+int foo;
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c
-@@ -0,0 +1,105 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
-+/* { dg-options "-O2 -ffast-math" } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+
-+/* Test instructions generated for half-precision arithmetic. */
-+
-+typedef __fp16 float16_t;
-+typedef __simd64_float16_t float16x4_t;
-+typedef __simd128_float16_t float16x8_t;
-+
-+typedef short int16x4_t __attribute__ ((vector_size (8)));
-+typedef short int int16x8_t __attribute__ ((vector_size (16)));
-+
-+float16_t
-+fp16_abs (float16_t a)
-+{
-+ return (a < 0) ? -a : a;
-+}
-+
-+#define TEST_UNOP(NAME, OPERATOR, TY) \
-+ TY test_##NAME##_##TY (TY a) \
-+ { \
-+ return OPERATOR (a); \
-+ }
-+
-+#define TEST_BINOP(NAME, OPERATOR, TY) \
-+ TY test_##NAME##_##TY (TY a, TY b) \
-+ { \
-+ return a OPERATOR b; \
-+ }
-+
-+#define TEST_CMP(NAME, OPERATOR, RTY, TY) \
-+ RTY test_##NAME##_##TY (TY a, TY b) \
-+ { \
-+ return a OPERATOR b; \
-+ }
-+
-+/* Scalars. */
-+
-+TEST_UNOP (neg, -, float16_t)
-+TEST_UNOP (abs, fp16_abs, float16_t)
-+
-+TEST_BINOP (add, +, float16_t)
-+TEST_BINOP (sub, -, float16_t)
-+TEST_BINOP (mult, *, float16_t)
-+TEST_BINOP (div, /, float16_t)
-+
-+TEST_CMP (equal, ==, int, float16_t)
-+TEST_CMP (unequal, !=, int, float16_t)
-+TEST_CMP (lessthan, <, int, float16_t)
-+TEST_CMP (greaterthan, >, int, float16_t)
-+TEST_CMP (lessthanequal, <=, int, float16_t)
-+TEST_CMP (greaterthanqual, >=, int, float16_t)
-+
-+/* Vectors of size 4. */
-+
-+TEST_UNOP (neg, -, float16x4_t)
-+
-+TEST_BINOP (add, +, float16x4_t)
-+TEST_BINOP (sub, -, float16x4_t)
-+TEST_BINOP (mult, *, float16x4_t)
-+TEST_BINOP (div, /, float16x4_t)
-+
-+TEST_CMP (equal, ==, int16x4_t, float16x4_t)
-+TEST_CMP (unequal, !=, int16x4_t, float16x4_t)
-+TEST_CMP (lessthan, <, int16x4_t, float16x4_t)
-+TEST_CMP (greaterthan, >, int16x4_t, float16x4_t)
-+TEST_CMP (lessthanequal, <=, int16x4_t, float16x4_t)
-+TEST_CMP (greaterthanqual, >=, int16x4_t, float16x4_t)
-+
-+/* Vectors of size 8. */
-+
-+TEST_UNOP (neg, -, float16x8_t)
-+
-+TEST_BINOP (add, +, float16x8_t)
-+TEST_BINOP (sub, -, float16x8_t)
-+TEST_BINOP (mult, *, float16x8_t)
-+TEST_BINOP (div, /, float16x8_t)
-+
-+TEST_CMP (equal, ==, int16x8_t, float16x8_t)
-+TEST_CMP (unequal, !=, int16x8_t, float16x8_t)
-+TEST_CMP (lessthan, <, int16x8_t, float16x8_t)
-+TEST_CMP (greaterthan, >, int16x8_t, float16x8_t)
-+TEST_CMP (lessthanequal, <=, int16x8_t, float16x8_t)
-+TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
-+
-+/* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } */
-+/* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+/* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 2 } } */
-+
-+/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
-+/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
-+/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
-+/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
-+/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */
-+/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */
-+
-+/* { dg-final { scan-assembler-not {vadd\.f32} } } */
-+/* { dg-final { scan-assembler-not {vsub\.f32} } } */
-+/* { dg-final { scan-assembler-not {vmul\.f32} } } */
-+/* { dg-final { scan-assembler-not {vdiv\.f32} } } */
-+/* { dg-final { scan-assembler-not {vcmp\.f16} } } */
-+/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */
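
The counts encode the expected lowering: arithmetic stays in half precision,
and the vector variants are expanded lane by lane, so each binary operator
appears 1 + 4 + 8 = 13 times in S-register form. The comparison counts line up
the same way - the two (in)equality operators account for 2 * 13 = 26
non-signalling vcmp.f32 and the four ordered operators for 4 * 13 = 52
signalling vcmpe.f32 - confirming that only comparisons are widened to f32,
while vadd/vsub/vmul/vdiv never appear in an .f32 form.
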
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-conv-1.c
-@@ -0,0 +1,101 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+/* Test ARMv8.2 FP16 conversions. */
-+#include <arm_fp16.h>
-+
-+float
-+f16_to_f32 (__fp16 a)
-+{
-+ return (float)a;
-+}
-+
-+float
-+f16_to_pf32 (__fp16* a)
-+{
-+ return (float)*a;
-+}
-+
-+short
-+f16_to_s16 (__fp16 a)
-+{
-+ return (short)a;
-+}
-+
-+short
-+pf16_to_s16 (__fp16* a)
-+{
-+ return (short)*a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvtb\.f32\.f16\ts[0-9]+, s[0-9]+} 4 } } */
-+
-+__fp16
-+f32_to_f16 (float a)
-+{
-+ return (__fp16)a;
-+}
-+
-+void
-+f32_to_pf16 (__fp16* x, float a)
-+{
-+ *x = (__fp16)a;
-+}
-+
-+__fp16
-+s16_to_f16 (short a)
-+{
-+ return (__fp16)a;
-+}
-+
-+void
-+s16_to_pf16 (__fp16* x, short a)
-+{
-+ *x = (__fp16)a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvtb\.f16\.f32\ts[0-9]+, s[0-9]+} 4 } } */
-+
-+float
-+s16_to_f32 (short a)
-+{
-+ return (float)a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.f32\.s32\ts[0-9]+, s[0-9]+} 3 } } */
-+
-+short
-+f32_to_s16 (float a)
-+{
-+ return (short)a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.s32\.f32\ts[0-9]+, s[0-9]+} 3 } } */
-+
-+unsigned short
-+f32_to_u16 (float a)
-+{
-+ return (unsigned short)a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.u32\.f32\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+short
-+f64_to_s16 (double a)
-+{
-+ return (short)a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.s32\.f64\ts[0-9]+, d[0-9]+} 1 } } */
-+
-+unsigned short
-+f64_to_u16 (double a)
-+{
-+ return (unsigned short)a;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.u32\.f64\ts[0-9]+, d[0-9]+} 1 } } */
-+
-+
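
No 16-bit-integer conversion patterns appear above because the scalar VFP
conversions only produce 32-bit integers: a (short) or (unsigned short) result
is obtained by fixing to 32 bits and truncating, and a __fp16 source is first
widened to f32. That is why vcvtb.f32.f16 is counted 4 times (the two f32
loads plus the two s16 conversions) and vcvt.s32.f32 3 times (f16_to_s16,
pf16_to_s16 and f32_to_s16). In C terms the s16 case behaves like the sketch
below (equivalent for in-range values; hypothetical, not part of the test):

short
via_s32 (__fp16 h)
{
  /* Widen f16->f32 (vcvtb.f32.f16), fix f32->s32 (vcvt.s32.f32),
     then truncate the integer to 16 bits.  */
  return (short) (int) (float) h;
}
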
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c
-@@ -0,0 +1,165 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+__fp16
-+test_load_1 (__fp16* a)
-+{
-+ return *a;
-+}
-+
-+__fp16
-+test_load_2 (__fp16* a, int i)
-+{
-+ return a[i];
-+}
-+
-+/* { dg-final { scan-assembler-times {vld1\.16\t\{d[0-9]+\[[0-9]+\]\}, \[r[0-9]+\]} 2 } } */
-+
-+void
-+test_store_1 (__fp16* a, __fp16 b)
-+{
-+ *a = b;
-+}
-+
-+void
-+test_store_2 (__fp16* a, int i, __fp16 b)
-+{
-+ a[i] = b;
-+}
-+
-+/* { dg-final { scan-assembler-times {vst1\.16\t\{d[0-9]+\[[0-9]+\]\}, \[r[0-9]+\]} 2 } } */
-+
-+__fp16
-+test_load_store_1 (__fp16* a, int i, __fp16* b)
-+{
-+ a[i] = b[i];
-+  return a[i];
-+}
-+
-+__fp16
-+test_load_store_2 (__fp16* a, int i, __fp16* b)
-+{
-+ a[i] = b[i + 2];
-+ return a[i];
-+}
-+/* { dg-final { scan-assembler-times {ldrh\tr[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {strh\tr[0-9]+} 2 } } */
-+
-+__fp16
-+test_select_1 (int sel, __fp16 a, __fp16 b)
-+{
-+ if (sel)
-+ return a;
-+ else
-+ return b;
-+}
-+
-+__fp16
-+test_select_2 (int sel, __fp16 a, __fp16 b)
-+{
-+ return sel ? a : b;
-+}
-+
-+__fp16
-+test_select_3 (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ return (a == b) ? b : c;
-+}
-+
-+__fp16
-+test_select_4 (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ return (a != b) ? b : c;
-+}
-+
-+__fp16
-+test_select_5 (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ return (a < b) ? b : c;
-+}
-+
-+__fp16
-+test_select_6 (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ return (a <= b) ? b : c;
-+}
-+
-+__fp16
-+test_select_7 (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ return (a > b) ? b : c;
-+}
-+
-+__fp16
-+test_select_8 (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ return (a >= b) ? b : c;
-+}
-+
-+/* { dg-final { scan-assembler-times {vseleq\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 4 } } */
-+/* { dg-final { scan-assembler-times {vselgt\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+/* { dg-final { scan-assembler-times {vselge\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+/* { dg-final { scan-assembler-times {vmov\.f16\ts[0-9]+, r[0-9]+} 4 } } */
-+/* { dg-final { scan-assembler-times {vmov\.f16\tr[0-9]+, s[0-9]+} 4 } } */
-+
-+int
-+test_compare_1 (__fp16 a, __fp16 b)
-+{
-+ if (a == b)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+int
-+test_compare_ (__fp16 a, __fp16 b)
-+{
-+ if (a != b)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+int
-+test_compare_2 (__fp16 a, __fp16 b)
-+{
-+ if (a > b)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+int
-+test_compare_3 (__fp16 a, __fp16 b)
-+{
-+ if (a >= b)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+int
-+test_compare_4 (__fp16 a, __fp16 b)
-+{
-+ if (a < b)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+int
-+test_compare_5 (__fp16 a, __fp16 b)
-+{
-+ if (a <= b)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler-not {vcmp\.f16} } } */
-+/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */
-+
-+/* { dg-final { scan-assembler-times {vcmp\.f32} 4 } } */
-+/* { dg-final { scan-assembler-times {vcmpe\.f32} 8 } } */
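
The four closing directives summarise the comparison strategy: __fp16 operands
are widened and compared in f32. The counts follow directly from the functions
above - the two equality-based selects plus the ==/!= compares give 4
non-signalling vcmp.f32, and the four ordered selects plus the four ordered
compares give 8 signalling vcmpe.f32.
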
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-1.c
-@@ -0,0 +1,490 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_v8_2a_fp16_neon } */
-+
-+/* Test instructions generated for the FP16 vector intrinsics. */
-+
-+#include <arm_neon.h>
-+
-+#define MSTRCAT(L, str) L##str
-+
-+#define UNOP_TEST(insn) \
-+ float16x4_t \
-+ MSTRCAT (test_##insn, _16x4) (float16x4_t a) \
-+ { \
-+ return MSTRCAT (insn, _f16) (a); \
-+ } \
-+ float16x8_t \
-+ MSTRCAT (test_##insn, _16x8) (float16x8_t a) \
-+ { \
-+ return MSTRCAT (insn, q_f16) (a); \
-+ }
-+
-+#define BINOP_TEST(insn) \
-+ float16x4_t \
-+ MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b) \
-+ { \
-+ return MSTRCAT (insn, _f16) (a, b); \
-+ } \
-+ float16x8_t \
-+ MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b) \
-+ { \
-+ return MSTRCAT (insn, q_f16) (a, b); \
-+ }
-+
-+#define BINOP_LANE_TEST(insn, I) \
-+ float16x4_t \
-+ MSTRCAT (test_##insn##_lane, _16x4) (float16x4_t a, float16x4_t b) \
-+ { \
-+ return MSTRCAT (insn, _lane_f16) (a, b, I); \
-+ } \
-+ float16x8_t \
-+ MSTRCAT (test_##insn##_lane, _16x8) (float16x8_t a, float16x4_t b) \
-+ { \
-+ return MSTRCAT (insn, q_lane_f16) (a, b, I); \
-+ }
-+
-+#define BINOP_LANEQ_TEST(insn, I) \
-+ float16x4_t \
-+ MSTRCAT (test_##insn##_laneq, _16x4) (float16x4_t a, float16x8_t b) \
-+ { \
-+ return MSTRCAT (insn, _laneq_f16) (a, b, I); \
-+ } \
-+ float16x8_t \
-+ MSTRCAT (test_##insn##_laneq, _16x8) (float16x8_t a, float16x8_t b) \
-+ { \
-+ return MSTRCAT (insn, q_laneq_f16) (a, b, I); \
-+ } \
-+
-+#define BINOP_N_TEST(insn) \
-+ float16x4_t \
-+ MSTRCAT (test_##insn##_n, _16x4) (float16x4_t a, float16_t b) \
-+ { \
-+ return MSTRCAT (insn, _n_f16) (a, b); \
-+ } \
-+ float16x8_t \
-+ MSTRCAT (test_##insn##_n, _16x8) (float16x8_t a, float16_t b) \
-+ { \
-+ return MSTRCAT (insn, q_n_f16) (a, b); \
-+ }
-+
-+#define TERNOP_TEST(insn) \
-+ float16_t \
-+ MSTRCAT (test_##insn, _16) (float16_t a, float16_t b, float16_t c) \
-+ { \
-+ return MSTRCAT (insn, h_f16) (a, b, c); \
-+ } \
-+ float16x4_t \
-+ MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b, \
-+ float16x4_t c) \
-+ { \
-+ return MSTRCAT (insn, _f16) (a, b, c); \
-+ } \
-+ float16x8_t \
-+ MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b, \
-+ float16x8_t c) \
-+ { \
-+ return MSTRCAT (insn, q_f16) (a, b, c); \
-+ }
-+
-+#define VCMP1_TEST(insn) \
-+ uint16x4_t \
-+ MSTRCAT (test_##insn, _16x4) (float16x4_t a) \
-+ { \
-+ return MSTRCAT (insn, _f16) (a); \
-+ } \
-+ uint16x8_t \
-+ MSTRCAT (test_##insn, _16x8) (float16x8_t a) \
-+ { \
-+ return MSTRCAT (insn, q_f16) (a); \
-+ }
-+
-+#define VCMP2_TEST(insn) \
-+ uint16x4_t \
-+ MSTRCAT (test_##insn, _16x4) (float16x4_t a, float16x4_t b) \
-+ { \
-+ return MSTRCAT (insn, _f16) (a, b); \
-+ } \
-+ uint16x8_t \
-+ MSTRCAT (test_##insn, _16x8) (float16x8_t a, float16x8_t b) \
-+ { \
-+ return MSTRCAT (insn, q_f16) (a, b); \
-+ }
-+
-+#define VCVT_TEST(insn, TY, TO, FR) \
-+ MSTRCAT (TO, 16x4_t) \
-+ MSTRCAT (test_##insn, TY) (MSTRCAT (FR, 16x4_t) a) \
-+ { \
-+ return MSTRCAT (insn, TY) (a); \
-+ } \
-+ MSTRCAT (TO, 16x8_t) \
-+ MSTRCAT (test_##insn##_q, TY) (MSTRCAT (FR, 16x8_t) a) \
-+ { \
-+ return MSTRCAT (insn, q##TY) (a); \
-+ }
-+
-+#define VCVT_N_TEST(insn, TY, TO, FR) \
-+ MSTRCAT (TO, 16x4_t) \
-+ MSTRCAT (test_##insn##_n, TY) (MSTRCAT (FR, 16x4_t) a) \
-+ { \
-+ return MSTRCAT (insn, _n##TY) (a, 1); \
-+ } \
-+ MSTRCAT (TO, 16x8_t) \
-+ MSTRCAT (test_##insn##_n_q, TY) (MSTRCAT (FR, 16x8_t) a) \
-+ { \
-+ return MSTRCAT (insn, q_n##TY) (a, 1); \
-+ }
-+
-+VCMP1_TEST (vceqz)
-+/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */
-+/* { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */
-+
-+VCMP1_TEST (vcgtz)
-+/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */
-+/* { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */
-+
-+VCMP1_TEST (vcgez)
-+/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */
-+/* { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */
-+
-+VCMP1_TEST (vcltz)
-+/* { dg-final { scan-assembler-times {vclt\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */
-+/* { dg-final { scan-assembler-times {vclt\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */
-+
-+VCMP1_TEST (vclez)
-+/* { dg-final { scan-assembler-times {vcle\.f16\td[0-9]+, d[0-9]+, #0} 1 } } */
-+/* { dg-final { scan-assembler-times {vcle\.f16\tq[0-9]+, q[0-9]+, #0} 1 } } */
-+
-+VCVT_TEST (vcvt, _f16_s16, float, int)
-+VCVT_N_TEST (vcvt, _f16_s16, float, int)
-+/* { dg-final { scan-assembler-times {vcvt\.f16\.s16\td[0-9]+, d[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.f16\.s16\tq[0-9]+, q[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.f16\.s16\td[0-9]+, d[0-9]+, #1} 1 } }
-+ { dg-final { scan-assembler-times {vcvt\.f16\.s16\tq[0-9]+, q[0-9]+, #1} 1 } } */
-+
-+VCVT_TEST (vcvt, _f16_u16, float, uint)
-+VCVT_N_TEST (vcvt, _f16_u16, float, uint)
-+/* { dg-final { scan-assembler-times {vcvt\.f16\.u16\td[0-9]+, d[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.f16\.u16\tq[0-9]+, q[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.f16\.u16\td[0-9]+, d[0-9]+, #1} 1 } }
-+ { dg-final { scan-assembler-times {vcvt\.f16\.u16\tq[0-9]+, q[0-9]+, #1} 1 } } */
-+
-+VCVT_TEST (vcvt, _s16_f16, int, float)
-+VCVT_N_TEST (vcvt, _s16_f16, int, float)
-+/* { dg-final { scan-assembler-times {vcvt\.s16\.f16\td[0-9]+, d[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.s16\.f16\tq[0-9]+, q[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.s16\.f16\td[0-9]+, d[0-9]+, #1} 1 } }
-+ { dg-final { scan-assembler-times {vcvt\.s16\.f16\tq[0-9]+, q[0-9]+, #1} 1 } } */
-+
-+VCVT_TEST (vcvt, _u16_f16, uint, float)
-+VCVT_N_TEST (vcvt, _u16_f16, uint, float)
-+/* { dg-final { scan-assembler-times {vcvt\.u16\.f16\td[0-9]+, d[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.u16\.f16\tq[0-9]+, q[0-9]+} 2 } }
-+ { dg-final { scan-assembler-times {vcvt\.u16\.f16\td[0-9]+, d[0-9]+, #1} 1 } }
-+ { dg-final { scan-assembler-times {vcvt\.u16\.f16\tq[0-9]+, q[0-9]+, #1} 1 } } */
-+
-+VCVT_TEST (vcvta, _s16_f16, int, float)
-+/* { dg-final { scan-assembler-times {vcvta\.s16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvta\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvta, _u16_f16, uint, float)
-+/* { dg-final { scan-assembler-times {vcvta\.u16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvta\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvtm, _s16_f16, int, float)
-+/* { dg-final { scan-assembler-times {vcvtm\.s16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvtm\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvtm, _u16_f16, uint, float)
-+/* { dg-final { scan-assembler-times {vcvtm\.u16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvtm\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvtn, _s16_f16, int, float)
-+/* { dg-final { scan-assembler-times {vcvtn\.s16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvtn\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvtn, _u16_f16, uint, float)
-+/* { dg-final { scan-assembler-times {vcvtn\.u16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvtn\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvtp, _s16_f16, int, float)
-+/* { dg-final { scan-assembler-times {vcvtp\.s16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvtp\.s16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+VCVT_TEST (vcvtp, _u16_f16, uint, float)
-+/* { dg-final { scan-assembler-times {vcvtp\.u16\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcvtp\.u16\.f16\tq[0-9]+, q[0-9]+} 1 } }
-+*/
-+
-+UNOP_TEST (vabs)
-+/* { dg-final { scan-assembler-times {vabs\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vabs\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vneg)
-+/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrecpe)
-+/* { dg-final { scan-assembler-times {vrecpe\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrecpe\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrnd)
-+/* { dg-final { scan-assembler-times {vrintz\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrintz\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrnda)
-+/* { dg-final { scan-assembler-times {vrinta\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrinta\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndm)
-+/* { dg-final { scan-assembler-times {vrintm\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrintm\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndn)
-+/* { dg-final { scan-assembler-times {vrintn\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrintn\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndp)
-+/* { dg-final { scan-assembler-times {vrintp\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrintp\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndx)
-+/* { dg-final { scan-assembler-times {vrintx\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrintx\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrsqrte)
-+/* { dg-final { scan-assembler-times {vrsqrte\.f16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrsqrte\.f16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vadd)
-+/* { dg-final { scan-assembler-times {vadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vadd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vabd)
-+/* { dg-final { scan-assembler-times {vabd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vabd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcage)
-+/* { dg-final { scan-assembler-times {vacge\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vacge\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcagt)
-+/* { dg-final { scan-assembler-times {vacgt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vacgt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcale)
-+/* { dg-final { scan-assembler-times {vacle\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vacle\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcalt)
-+/* { dg-final { scan-assembler-times {vaclt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vaclt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vceq)
-+/* { dg-final { scan-assembler-times {vceq\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vceq\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcge)
-+/* { dg-final { scan-assembler-times {vcge\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcge\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcgt)
-+/* { dg-final { scan-assembler-times {vcgt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcgt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vcle)
-+/* { dg-final { scan-assembler-times {vcle\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vcle\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+VCMP2_TEST (vclt)
-+/* { dg-final { scan-assembler-times {vclt\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vclt\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vmax)
-+/* { dg-final { scan-assembler-times {vmax\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vmax\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vmin)
-+/* { dg-final { scan-assembler-times {vmin\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vmin\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vmaxnm)
-+/* { dg-final { scan-assembler-times {vmaxnm\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vmaxnm\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vminnm)
-+/* { dg-final { scan-assembler-times {vminnm\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vminnm\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vmul)
-+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
-+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+BINOP_LANE_TEST (vmul, 2)
-+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } }
-+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } } */
-+BINOP_N_TEST (vmul)
-+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } }
-+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/
-+
-+float16x4_t
-+test_vpadd_16x4 (float16x4_t a, float16x4_t b)
-+{
-+ return vpadd_f16 (a, b);
-+}
-+/* { dg-final { scan-assembler-times {vpadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
-+
-+float16x4_t
-+test_vpmax_16x4 (float16x4_t a, float16x4_t b)
-+{
-+ return vpmax_f16 (a, b);
-+}
-+/* { dg-final { scan-assembler-times {vpmax\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
-+
-+float16x4_t
-+test_vpmin_16x4 (float16x4_t a, float16x4_t b)
-+{
-+ return vpmin_f16 (a, b);
-+}
-+/* { dg-final { scan-assembler-times {vpmin\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
-+
-+BINOP_TEST (vsub)
-+/* { dg-final { scan-assembler-times {vsub\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vsub\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vrecps)
-+/* { dg-final { scan-assembler-times {vrecps\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrecps\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+BINOP_TEST (vrsqrts)
-+/* { dg-final { scan-assembler-times {vrsqrts\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrsqrts\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+TERNOP_TEST (vfma)
-+/* { dg-final { scan-assembler-times {vfma\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vfma\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+TERNOP_TEST (vfms)
-+/* { dg-final { scan-assembler-times {vfms\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vfms\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+float16x4_t
-+test_vmov_n_f16 (float16_t a)
-+{
-+ return vmov_n_f16 (a);
-+}
-+
-+float16x4_t
-+test_vdup_n_f16 (float16_t a)
-+{
-+ return vdup_n_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } } */
-+
-+float16x8_t
-+test_vmovq_n_f16 (float16_t a)
-+{
-+ return vmovq_n_f16 (a);
-+}
-+
-+float16x8_t
-+test_vdupq_n_f16 (float16_t a)
-+{
-+ return vdupq_n_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 2 } } */
-+
-+float16x4_t
-+test_vdup_lane_f16 (float16x4_t a)
-+{
-+ return vdup_lane_f16 (a, 1);
-+}
-+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */
-+
-+float16x8_t
-+test_vdupq_lane_f16 (float16x4_t a)
-+{
-+ return vdupq_lane_f16 (a, 1);
-+}
-+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */
-+
-+float16x4_t
-+test_vext_f16 (float16x4_t a, float16x4_t b)
-+{
-+ return vext_f16 (a, b, 1);
-+}
-+/* { dg-final { scan-assembler-times {vext\.16\td[0-9]+, d[0-9]+, d[0-9]+, #1} 1 } } */
-+
-+float16x8_t
-+test_vextq_f16 (float16x8_t a, float16x8_t b)
-+{
-+ return vextq_f16 (a, b, 1);
-+}
-+/* { dg-final { scan-assembler-times {vext\.16\tq[0-9]+, q[0-9]+, q[0-9]+, #1} 1 } } */
-+
-+UNOP_TEST (vrev64)
-+/* { dg-final { scan-assembler-times {vrev64\.16\td[0-9]+, d[0-9]+} 1 } }
-+ { dg-final { scan-assembler-times {vrev64\.16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+float16x4_t
-+test_vbsl16x4 (uint16x4_t a, float16x4_t b, float16x4_t c)
-+{
-+ return vbsl_f16 (a, b, c);
-+}
-+/* { dg-final { scan-assembler-times {vbsl\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
-+
-+float16x8_t
-+test_vbslq16x8 (uint16x8_t a, float16x8_t b, float16x8_t c)
-+{
-+ return vbslq_f16 (a, b, c);
-+}
-+/*{ dg-final { scan-assembler-times {vbsl\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
-+
-+float16x4x2_t
-+test_vzip16x4 (float16x4_t a, float16x4_t b)
-+{
-+ return vzip_f16 (a, b);
-+}
-+/* { dg-final { scan-assembler-times {vzip\.16\td[0-9]+, d[0-9]+} 1 } } */
-+
-+float16x8x2_t
-+test_vzipq16x8 (float16x8_t a, float16x8_t b)
-+{
-+ return vzipq_f16 (a, b);
-+}
-+/*{ dg-final { scan-assembler-times {vzip\.16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+float16x4x2_t
-+test_vuzp16x4 (float16x4_t a, float16x4_t b)
-+{
-+ return vuzp_f16 (a, b);
-+}
-+/* { dg-final { scan-assembler-times {vuzp\.16\td[0-9]+, d[0-9]+} 1 } } */
-+
-+float16x8x2_t
-+test_vuzpq16x8 (float16x8_t a, float16x8_t b)
-+{
-+ return vuzpq_f16 (a, b);
-+}
-+/*{ dg-final { scan-assembler-times {vuzp\.16\tq[0-9]+, q[0-9]+} 1 } } */
-+
-+float16x4x2_t
-+test_vtrn16x4 (float16x4_t a, float16x4_t b)
-+{
-+ return vtrn_f16 (a, b);
-+}
-+/* { dg-final { scan-assembler-times {vtrn\.16\td[0-9]+, d[0-9]+} 1 } } */
-+
-+float16x8x2_t
-+test_vtrnq16x8 (float16x8_t a, float16x8_t b)
-+{
-+ return vtrnq_f16 (a, b);
-+}
-+/*{ dg-final { scan-assembler-times {vtrn\.16\tq[0-9]+, q[0-9]+} 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-scalar-1.c
-@@ -0,0 +1,203 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+/* Test instructions generated for the FP16 scalar intrinsics. */
-+#include <arm_fp16.h>
-+
-+#define MSTRCAT(L, str) L##str
-+
-+#define UNOP_TEST(insn) \
-+ float16_t \
-+ MSTRCAT (test_##insn, 16) (float16_t a) \
-+ { \
-+ return MSTRCAT (insn, h_f16) (a); \
-+ }
-+
-+#define BINOP_TEST(insn) \
-+ float16_t \
-+ MSTRCAT (test_##insn, 16) (float16_t a, float16_t b) \
-+ { \
-+ return MSTRCAT (insn, h_f16) (a, b); \
-+ }
-+
-+#define TERNOP_TEST(insn) \
-+ float16_t \
-+ MSTRCAT (test_##insn, 16) (float16_t a, float16_t b, float16_t c) \
-+ { \
-+ return MSTRCAT (insn, h_f16) (a, b, c); \
-+ }
-+
-+float16_t
-+test_vcvth_f16_s32 (int32_t a)
-+{
-+ return vcvth_f16_s32 (a);
-+}
-+
-+float16_t
-+test_vcvth_n_f16_s32 (int32_t a)
-+{
-+ return vcvth_n_f16_s32 (a, 1);
-+}
-+/* { dg-final { scan-assembler-times {vcvt\.f16\.s32\ts[0-9]+, s[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {vcvt\.f16\.s32\ts[0-9]+, s[0-9]+, #1} 1 } } */
-+
-+float16_t
-+test_vcvth_f16_u32 (uint32_t a)
-+{
-+ return vcvth_f16_u32 (a);
-+}
-+
-+float16_t
-+test_vcvth_n_f16_u32 (uint32_t a)
-+{
-+ return vcvth_n_f16_u32 (a, 1);
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.f16\.u32\ts[0-9]+, s[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {vcvt\.f16\.u32\ts[0-9]+, s[0-9]+, #1} 1 } } */
-+
-+uint32_t
-+test_vcvth_u32_f16 (float16_t a)
-+{
-+ return vcvth_u32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvt\.u32\.f16\ts[0-9]+, s[0-9]+} 2 } } */
-+
-+uint32_t
-+test_vcvth_n_u32_f16 (float16_t a)
-+{
-+ return vcvth_n_u32_f16 (a, 1);
-+}
-+/* { dg-final { scan-assembler-times {vcvt\.u32\.f16\ts[0-9]+, s[0-9]+, #1} 1 } } */
-+
-+int32_t
-+test_vcvth_s32_f16 (float16_t a)
-+{
-+ return vcvth_s32_f16 (a);
-+}
-+
-+int32_t
-+test_vcvth_n_s32_f16 (float16_t a)
-+{
-+ return vcvth_n_s32_f16 (a, 1);
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.s32\.f16\ts[0-9]+, s[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {vcvt\.s32\.f16\ts[0-9]+, s[0-9]+, #1} 1 } } */
-+
-+int32_t
-+test_vcvtah_s32_f16 (float16_t a)
-+{
-+ return vcvtah_s32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvta\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+uint32_t
-+test_vcvtah_u32_f16 (float16_t a)
-+{
-+ return vcvtah_u32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvta\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+int32_t
-+test_vcvtmh_s32_f16 (float16_t a)
-+{
-+ return vcvtmh_s32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvtm\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+uint32_t
-+test_vcvtmh_u32_f16 (float16_t a)
-+{
-+ return vcvtmh_u32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvtm\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } }
-+ */
-+
-+int32_t
-+test_vcvtnh_s32_f16 (float16_t a)
-+{
-+ return vcvtnh_s32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvtn\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } }
-+ */
-+
-+uint32_t
-+test_vcvtnh_u32_f16 (float16_t a)
-+{
-+ return vcvtnh_u32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvtn\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } }
-+ */
-+
-+int32_t
-+test_vcvtph_s32_f16 (float16_t a)
-+{
-+ return vcvtph_s32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvtp\.s32\.f16\ts[0-9]+, s[0-9]+} 1 } }
-+ */
-+
-+uint32_t
-+test_vcvtph_u32_f16 (float16_t a)
-+{
-+ return vcvtph_u32_f16 (a);
-+}
-+/* { dg-final { scan-assembler-times {vcvtp\.u32\.f16\ts[0-9]+, s[0-9]+} 1 } }
-+ */
-+
-+UNOP_TEST (vabs)
-+/* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vneg)
-+/* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrnd)
-+/* { dg-final { scan-assembler-times {vrintz\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndi)
-+/* { dg-final { scan-assembler-times {vrintr\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrnda)
-+/* { dg-final { scan-assembler-times {vrinta\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndm)
-+/* { dg-final { scan-assembler-times {vrintm\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndn)
-+/* { dg-final { scan-assembler-times {vrintn\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndp)
-+/* { dg-final { scan-assembler-times {vrintp\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vrndx)
-+/* { dg-final { scan-assembler-times {vrintx\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+UNOP_TEST (vsqrt)
-+/* { dg-final { scan-assembler-times {vsqrt\.f16\ts[0-9]+, s[0-9]+} 1 } } */
-+
-+BINOP_TEST (vadd)
-+/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+BINOP_TEST (vdiv)
-+/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+BINOP_TEST (vmaxnm)
-+/* { dg-final { scan-assembler-times {vmaxnm\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+BINOP_TEST (vminnm)
-+/* { dg-final { scan-assembler-times {vminnm\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+BINOP_TEST (vmul)
-+/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+BINOP_TEST (vsub)
-+/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+TERNOP_TEST (vfma)
-+/* { dg-final { scan-assembler-times {vfma\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+TERNOP_TEST (vfms)
-+/* { dg-final { scan-assembler-times {vfms\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
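
For the rounding intrinsics above, each vrnd* maps to the vrint* form with the
matching rounding mode: vrnd to vrintz (toward zero), vrndi to vrintr (current
mode), vrnda to vrinta (nearest, ties away), vrndm to vrintm (toward minus
infinity), vrndn to vrintn (nearest, ties to even), vrndp to vrintp (toward
plus infinity) and vrndx to vrintx (current mode, raising inexact), which is
what the per-instruction scans check.
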
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/armv8_2-fp16-scalar-2.c
-@@ -0,0 +1,71 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
-+/* { dg-options "-O2 -std=c11" } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+/* Test compiler use of FP16 instructions. */
-+#include <arm_fp16.h>
-+
-+float16_t
-+test_mov_imm_1 (float16_t a)
-+{
-+ return 1.0;
-+}
-+
-+float16_t
-+test_mov_imm_2 (float16_t a)
-+{
-+ float16_t b = 1.0;
-+ return b;
-+}
-+
-+float16_t
-+test_vmov_imm_3 (float16_t a)
-+{
-+ float16_t b = 1.0;
-+ return vaddh_f16 (a, b);
-+}
-+
-+float16_t
-+test_vmov_imm_4 (float16_t a)
-+{
-+ return vaddh_f16 (a, 1.0);
-+}
-+
-+/* { dg-final { scan-assembler-times {vmov\.f16\ts[0-9]+, #1\.0e\+0} 4 } }
-+   { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 2 } } */
-+
-+float16_t
-+test_vmla_1 (float16_t a, float16_t b, float16_t c)
-+{
-+ return vaddh_f16 (vmulh_f16 (a, b), c);
-+}
-+/* { dg-final { scan-assembler-times {vmla\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+float16_t
-+test_vmla_2 (float16_t a, float16_t b, float16_t c)
-+{
-+ return vsubh_f16 (vmulh_f16 (vnegh_f16 (a), b), c);
-+}
-+/* { dg-final { scan-assembler-times {vnmla\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
-+float16_t
-+test_vmls_1 (float16_t a, float16_t b, float16_t c)
-+{
-+ return vsubh_f16 (c, vmulh_f16 (a, b));
-+}
-+
-+float16_t
-+test_vmls_2 (float16_t a, float16_t b, float16_t c)
-+{
-+ return vsubh_f16 (a, vmulh_f16 (b, c));
-+}
-+/* { dg-final { scan-assembler-times {vmls\.f16} 2 } } */
-+
-+float16_t
-+test_vnmls_1 (float16_t a, float16_t b, float16_t c)
-+{
-+ return vsubh_f16 (vmulh_f16 (a, b), c);
-+}
-+/* { dg-final { scan-assembler-times {vnmls\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-1.c
-@@ -0,0 +1,12 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2 -fno-ipa-icf" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-comp-swap-release-acquire.x"
-+
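-+/* A release/acquire compare-and-swap should use the load-acquire and
-+   store-release exclusive instructions, with no explicit dmb barrier. */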
-+/* { dg-final { scan-assembler-times "ldaex" 4 } } */
-+/* { dg-final { scan-assembler-times "stlex" 4 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2 -fno-ipa-icf" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-comp-swap-release-acquire.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex" 4 } } */
-+/* { dg-final { scan-assembler-times "stlex" 4 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2 -fno-ipa-icf" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-comp-swap-release-acquire.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex" 4 } } */
-+/* { dg-final { scan-assembler-times "stlex" 4 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-comp-swap-release-acquire.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2 -fno-ipa-icf" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-comp-swap-release-acquire.x"
--
--/* { dg-final { scan-assembler-times "ldaex" 4 } } */
--/* { dg-final { scan-assembler-times "stlex" 4 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-acq_rel.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-acq_rel.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-acq_rel.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-acq_rel.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-acq_rel.x"
--
--/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-acquire.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-acquire.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-acquire.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-acquire.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-acquire.x"
--
--/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-char.x"
-+
-+/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-char.x"
-+
-+/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-char-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-char.x"
-+
-+/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-char.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-char.x"
--
--/* { dg-final { scan-assembler-times "ldrexb\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strexb\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-1.c
-@@ -0,0 +1,11 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-consume.x"
-+
-+/* Scanning for ldaex is a workaround for PR59448 (consume is promoted to acquire). */
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-2.c
-@@ -0,0 +1,11 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-consume.x"
-+
-+/* Scanning for ldaex is a workaround for PR59448 (consume is promoted to acquire). */
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-consume-3.c
-@@ -0,0 +1,11 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-consume.x"
-+
-+/* Scanning for ldaex is a workaround for PR59448 (consume is promoted to acquire). */
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-consume.c
-+++ b/src//dev/null
-@@ -1,11 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-consume.x"
--
--/* Scan for ldaex is a PR59448 consume workaround. */
--/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-int.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-int.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-int-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-int.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-int.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-int.x"
--
--/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-relaxed.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-relaxed.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-relaxed.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-relaxed.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-relaxed.x"
--
--/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-release.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-release.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-release-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-release.x"
-+
-+/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-release.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-release.x"
--
--/* { dg-final { scan-assembler-times "ldrex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-seq_cst.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-seq_cst.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-seq_cst.x"
-+
-+/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-seq_cst.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-seq_cst.x"
--
--/* { dg-final { scan-assembler-times "ldaex\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "stlex\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-1.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8a_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8a } */
-+
-+#include "../aarch64/atomic-op-short.x"
-+
-+/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-2.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+
-+#include "../aarch64/atomic-op-short.x"
-+
-+/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/atomic-op-short-3.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+
-+#include "../aarch64/atomic-op-short.x"
-+
-+/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
-+/* { dg-final { scan-assembler-not "dmb" } } */
---- a/src/gcc/testsuite/gcc.target/arm/atomic-op-short.c
-+++ b/src//dev/null
-@@ -1,10 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_arch_v8a_ok } */
--/* { dg-options "-O2" } */
--/* { dg-add-options arm_arch_v8a } */
--
--#include "../aarch64/atomic-op-short.x"
--
--/* { dg-final { scan-assembler-times "ldrexh\tr\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-times "strexh\t...?, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 6 } } */
--/* { dg-final { scan-assembler-not "dmb" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/attr-fp16-arith-1.c
-@@ -0,0 +1,58 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_v8_2a_fp16_scalar } */
-+
-+/* Reset fpu to a value compatible with the next pragmas. */
-+#pragma GCC target ("fpu=vfp")
-+
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=fp-armv8")
-+
-+#ifndef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
-+#error __ARM_FEATURE_FP16_SCALAR_ARITHMETIC not defined.
-+#endif
-+
-+#pragma GCC push_options
-+#pragma GCC target ("fpu=neon-fp-armv8")
-+
-+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-+#error __ARM_FEATURE_FP16_VECTOR_ARITHMETIC not defined.
-+#endif
-+
-+#ifndef __ARM_NEON
-+#error __ARM_NEON not defined.
-+#endif
-+
-+#if !defined (__ARM_FP) || !(__ARM_FP & 0x2)
-+#error Invalid value for __ARM_FP
-+#endif
-+
-+#include "arm_neon.h"
-+
-+float16_t
-+foo (float16x4_t b)
-+{
-+ float16x4_t a = {2.0, 3.0, 4.0, 5.0};
-+ float16x4_t res = vadd_f16 (a, b);
-+
-+ return res[0];
-+}
-+
-+/* { dg-final { scan-assembler "vadd\\.f16\td\[0-9\]+, d\[0-9\]+" } } */
-+
-+#pragma GCC pop_options
-+
-+/* Check that the FP version is correctly reset to mfpu=fp-armv8. */
-+
-+#if !defined (__ARM_FP) || !(__ARM_FP & 0x2)
-+#error __ARM_FP should record FP16 support.
-+#endif
-+
-+#pragma GCC pop_options
-+
-+/* Check that the FP version is correctly reset to mfpu=vfp. */
-+
-+#if !defined (__ARM_FP) || (__ARM_FP & 0x2)
-+#error Unexpected value for __ARM_FP.
-+#endif
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddl.c
-@@ -0,0 +1,17 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+long overflow_add (long x, long y)
-+{
-+ long r;
-+
-+ int ovr = __builtin_saddl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
-+/* { dg-final { scan-assembler "adds" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_saddll.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+long long overflow_add (long long x, long long y)
-+{
-+ long long r;
-+
-+ int ovr = __builtin_saddll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
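-+/* The 64-bit signed addition is split into an adds for the low word and
-+   an adcs for the high word; the overflow check uses the flags set by
-+   the final adcs. */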
-+/* { dg-final { scan-assembler "adds" } } */
-+/* { dg-final { scan-assembler "adcs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubl.c
-@@ -0,0 +1,17 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+long overflow_sub (long x, long y)
-+{
-+ long r;
-+
-+ int ovr = __builtin_ssubl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
-+/* { dg-final { scan-assembler "subs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_ssubll.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+long long overflow_sub (long long x, long long y)
-+{
-+ long long r;
-+
-+ int ovr = __builtin_ssubll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
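-+/* The 64-bit signed subtraction is split into subs for the low word and
-+   sbcs for the high word; sbcs subtracts with borrow and sets the flags
-+   used by the overflow check. */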
-+/* { dg-final { scan-assembler "subs" } } */
-+/* { dg-final { scan-assembler "sbcs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddl.c
-@@ -0,0 +1,17 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+unsigned long overflow_add (unsigned long x, unsigned long y)
-+{
-+ unsigned long r;
-+
-+ int ovr = __builtin_uaddl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
-+/* { dg-final { scan-assembler "adds" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_uaddll.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+unsigned long long overflow_add (unsigned long long x, unsigned long long y)
-+{
-+ unsigned long long r;
-+
-+ int ovr = __builtin_uaddll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
-+/* { dg-final { scan-assembler "adds" } } */
-+/* { dg-final { scan-assembler "adcs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubl.c
-@@ -0,0 +1,17 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+unsigned long overflow_sub (unsigned long x, unsigned long y)
-+{
-+ unsigned long r;
-+
-+ int ovr = __builtin_usubl_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
-+/* { dg-final { scan-assembler "subs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/builtin_usubll.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-options "-O2" } */
-+/* { dg-require-effective-target arm32 } */
-+extern void overflow_handler ();
-+
-+unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
-+{
-+ unsigned long long r;
-+
-+ int ovr = __builtin_usubll_overflow (x, y, &r);
-+ if (ovr)
-+ overflow_handler ();
-+
-+ return r;
-+}
-+
-+/* { dg-final { scan-assembler "subs" } } */
-+/* { dg-final { scan-assembler "sbcs" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cbz.c
-@@ -0,0 +1,14 @@
-+/* { dg-do compile {target { arm_thumb2 || arm_thumb1_cbz_ok } } } */
-+/* { dg-options "-O2" } */
-+
-+int
-+foo (int a, int *b)
-+{
-+ if (a)
-+ *b = 1;
-+ return 0;
-+}
-+
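-+/* The guarded store allows a single compare-and-branch-on-zero
-+   instruction instead of a cmp/beq pair. */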
-+/* { dg-final { scan-assembler-times "cbz\\tr\\d" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c
-@@ -0,0 +1,57 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int b:5;
-+ unsigned int c:11, :0, d:8;
-+ struct { unsigned int ee:2; } e;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+extern void foo (test_st st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+ r.values.v4 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 255" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #255" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #3" } } */
-+/* { dg-final { scan-assembler "ands\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c
-@@ -0,0 +1,53 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned short b :5;
-+ unsigned char c;
-+ unsigned short d :11;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #8191" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 255" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #2047" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c
-@@ -0,0 +1,63 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int b : 3;
-+ unsigned int c : 14;
-+ unsigned int d : 1;
-+ struct {
-+ unsigned int ee : 2;
-+ unsigned short ff : 15;
-+ } e;
-+ unsigned char g : 1;
-+ unsigned char : 4;
-+ unsigned char h : 3;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+ r.values.v4 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 1023" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #3" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 32767" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #255" } } */
-+/* { dg-final { scan-assembler "ands\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c
-@@ -0,0 +1,54 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned short b :5;
-+ unsigned char c;
-+ unsigned short d :11;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #8191" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 255" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #2047" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c
-@@ -0,0 +1,57 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int :0;
-+ unsigned int b :1;
-+ unsigned short :0;
-+ unsigned short c;
-+ unsigned int :0;
-+ unsigned int d :21;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #255" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #1" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 65535" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 31" } } */
-+/* { dg-final { scan-assembler "ands\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c
-@@ -0,0 +1,56 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ char a:3;
-+} test_st3;
-+
-+typedef struct
-+{
-+ char a:3;
-+} test_st2;
-+
-+typedef struct
-+{
-+ test_st2 st2;
-+ test_st3 st3;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #1799" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c
-@@ -0,0 +1,96 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned short a :11;
-+} test_st_4;
-+
-+typedef union
-+{
-+ char a;
-+ test_st_4 st4;
-+}test_un_2;
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int :0;
-+ unsigned int b :1;
-+ unsigned short :0;
-+ unsigned short c;
-+ unsigned int :0;
-+ unsigned int d :21;
-+} test_st_3;
-+
-+typedef struct
-+{
-+ unsigned char a :3;
-+ unsigned int b :13;
-+ test_un_2 un2;
-+} test_st_2;
-+
-+typedef union
-+{
-+ test_st_2 st2;
-+ test_st_3 st3;
-+}test_un_1;
-+
-+typedef struct
-+{
-+ unsigned char a :2;
-+ unsigned char :0;
-+ unsigned short b :5;
-+ unsigned char :0;
-+ unsigned char c :4;
-+ test_un_1 un1;
-+} test_st_1;
-+
-+typedef union
-+{
-+ test_st_1 st1;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st_1;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1);
-+
-+int
-+main (void)
-+{
-+ read_st_1 r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+ r.values.v4 = 0xFFFFFFFF;
-+
-+ f (r.st1);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #7939" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 15" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 2047" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr4, #1" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 65535" } } */
-+/* { dg-final { scan-assembler "ands\tr2, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 31" } } */
-+/* { dg-final { scan-assembler "ands\tr3, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c
-@@ -0,0 +1,22 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+/* { dg-options "-mcmse" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (int);
-+
-+int
-+foo (int a)
-+{
-+ return bar (bar (a + 1));
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c
-@@ -0,0 +1,25 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+/* { dg-options "-mcmse" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (1.0f, 2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler-not "movs\tr0, r4" } } */
-+/* { dg-final { scan-assembler "\n\tmovs\tr1, r4" } } */
-+/* { dg-final { scan-assembler-not "\n\tmovs\tr2, r4\n\tmovs\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vmov" } } */
-+/* { dg-final { scan-assembler-not "vmsr" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c
-@@ -0,0 +1,19 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+/* { dg-options "-mcmse" } */
-+
-+extern float bar (void);
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+foo (void)
-+{
-+ return bar ();
-+}
-+/* { dg-final { scan-assembler "movs\tr1, r0" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r0" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r0" } } */
-+/* { dg-final { scan-assembler "mov\tip, r0" } } */
-+/* { dg-final { scan-assembler "mov\tlr, r0" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq," } } */
-+/* { dg-final { scan-assembler "bxns" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+/* { dg-options "-mcmse" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (2.0) + a + 1;
-+}
-+
-+/* Remember: don't clear r0 and r1, because they carry the double
-+ * argument being passed to bar. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c
-@@ -0,0 +1,29 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_base_ok } */
-+/* { dg-add-options arm_arch_v8m_base } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp" } */
-+
-+double __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
-+
-+double
-+foo (double a)
-+{
-+ return bar (1.0f, 2.0) + a;
-+}
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+baz (float a, double b)
-+{
-+ return (float) bar (a, b);
-+}
-+
-+/* Make sure we are not using FP instructions, since ARMv8-M Baseline does not
-+ support such instructions. */
-+/* { dg-final { scan-assembler-not "vmov" } } */
-+/* { dg-final { scan-assembler-not "vmsr" } } */
-+/* { dg-final { scan-assembler-not "vmrs" } } */
-+
-+/* Just double checking that we are still doing cmse though. */
-+/* { dg-final { scan-assembler-not "vmrs" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c
-@@ -0,0 +1,71 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a :2;
-+ unsigned char :0;
-+ unsigned short b :5;
-+ unsigned char :0;
-+ unsigned short c :3;
-+ unsigned char :0;
-+ unsigned int d :9;
-+} test_st_1;
-+
-+typedef struct
-+{
-+ unsigned short a :7;
-+ unsigned char :0;
-+ unsigned char b :1;
-+ unsigned char :0;
-+ unsigned short c :6;
-+} test_st_2;
-+
-+typedef union
-+{
-+ test_st_1 st_1;
-+ test_st_2 st_2;
-+}test_un;
-+
-+typedef union
-+{
-+ test_un un;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_un;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
-+
-+int
-+main (void)
-+{
-+ read_un r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+
-+ f (r.un);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #8063" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 63" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #511" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr2, r4" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c
-@@ -0,0 +1,86 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a :2;
-+ unsigned char :0;
-+ unsigned short b :5;
-+ unsigned char :0;
-+ unsigned short c :3;
-+ unsigned char :0;
-+ unsigned int d :9;
-+} test_st_1;
-+
-+typedef struct
-+{
-+ unsigned short a :7;
-+ unsigned char :0;
-+ unsigned char b :1;
-+ unsigned char :0;
-+ unsigned short c :6;
-+} test_st_2;
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int :0;
-+ unsigned int b :1;
-+ unsigned short :0;
-+ unsigned short c;
-+ unsigned int :0;
-+ unsigned int d :21;
-+} test_st_3;
-+
-+typedef union
-+{
-+ test_st_1 st_1;
-+ test_st_2 st_2;
-+ test_st_3 st_3;
-+}test_un;
-+
-+typedef union
-+{
-+ test_un un;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_un;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
-+
-+int
-+main (void)
-+{
-+ read_un r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+
-+ f (r.un);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #8191" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 63" } } */
-+/* { dg-final { scan-assembler "ands\tr0, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #511" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 65535" } } */
-+/* { dg-final { scan-assembler "ands\tr1, r4" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr4, 31" } } */
-+/* { dg-final { scan-assembler "ands\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr4, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "movs\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c
-@@ -0,0 +1,42 @@
-+/* { dg-do run } */
-+/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
-+
-+typedef struct
-+{
-+ unsigned short a : 6;
-+ unsigned char b : 3;
-+ unsigned char c;
-+ unsigned short d : 8;
-+} test_st;
-+
-+test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
-+{
-+ test_st t;
-+ t.a = 63u;
-+ t.b = 7u;
-+ t.c = 255u;
-+ t.d = 255u;
-+ return t;
-+}
-+
-+int
-+main (void)
-+{
-+ test_st t;
-+ t = foo ();
-+ if (t.a != 63u
-+ || t.b != 7u
-+ || t.c != 255u
-+ || t.d != 255u)
-+ __builtin_abort ();
-+ return 0;
-+}
-+
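-+/* The movw/movt pair builds the mask 0xffff073f, which keeps a (bits
-+   0-5), b (bits 8-10), c (bits 16-23) and d (bits 24-31) and clears the
-+   padding bits of the returned struct before the bxns return. */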
-+/* { dg-final { scan-assembler "movw\tr1, #1855" } } */
-+/* { dg-final { scan-assembler "movt\tr1, 65535" } } */
-+/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c
-@@ -0,0 +1,36 @@
-+/* { dg-do run } */
-+/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
-+
-+typedef struct
-+{
-+ short a : 7;
-+ signed char b : 3;
-+ short c : 11;
-+} test_st;
-+
-+test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
-+{
-+ test_st t;
-+ t.a = -64;
-+ t.b = -4 ;
-+ t.c = -1024;
-+ return t;
-+}
-+
-+int
-+main (void)
-+{
-+ test_st t;
-+ t = foo ();
-+ if (t.a != -64
-+ || t.b != -4
-+ || t.c != -1024)
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tr1, #1919" } } */
-+/* { dg-final { scan-assembler "movt\tr1, 2047" } } */
-+/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c
-@@ -0,0 +1,37 @@
-+/* { dg-do run } */
-+/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
-+
-+typedef struct
-+{
-+ short a;
-+ signed char b : 2;
-+ short : 1;
-+ signed char c : 3;
-+} test_st;
-+
-+test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
-+{
-+ test_st t;
-+ t.a = -32768;
-+ t.b = -2;
-+ t.c = -4;
-+ return t;
-+}
-+
-+int
-+main (void)
-+{
-+ test_st t;
-+ t = foo ();
-+ if (t.a != -32768
-+ || t.b != -2
-+ || t.c != -4)
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tr1, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tr1, 63" } } */
-+/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c
-@@ -0,0 +1,108 @@
-+/* { dg-do compile } */
-+/* { dg-options "-Os -mcmse -fdump-rtl-expand" } */
-+
-+#include <arm_cmse.h>
-+
-+extern int a;
-+extern int bar (void);
-+
-+int foo (char * p)
-+{
-+ cmse_address_info_t cait;
-+
-+ cait = cmse_TT (&a);
-+ if (cait.flags.mpu_region)
-+ a++;
-+
-+ cait = cmse_TT_fptr (&bar);
-+ if (cait.flags.mpu_region)
-+ a+= bar ();
-+
-+ cait = cmse_TTA (&a);
-+ if (cait.flags.mpu_region)
-+ a++;
-+
-+ cait = cmse_TTA_fptr (&bar);
-+ if (cait.flags.mpu_region)
-+ a+= bar ();
-+
-+ cait = cmse_TTT (&a);
-+ if (cait.flags.mpu_region)
-+ a++;
-+
-+ cait = cmse_TTT_fptr (&bar);
-+ if (cait.flags.mpu_region)
-+ a+= bar ();
-+
-+ cait = cmse_TTAT (&a);
-+ if (cait.flags.mpu_region)
-+ a++;
-+
-+ cait = cmse_TTAT_fptr (&bar);
-+ if (cait.flags.mpu_region)
-+ a+= bar ();
-+
-+ p = (char *) cmse_check_address_range ((void *) p, sizeof (char), 0);
-+ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
-+ CMSE_MPU_UNPRIV);
-+ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
-+ CMSE_MPU_READWRITE);
-+ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
-+ CMSE_MPU_UNPRIV | CMSE_MPU_READ);
-+ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
-+ CMSE_AU_NONSECURE
-+ | CMSE_MPU_NONSECURE);
-+ p = (char *) cmse_check_address_range ((void *) p, sizeof (char),
-+ CMSE_NONSECURE | CMSE_MPU_UNPRIV);
-+
-+ p = (char *) cmse_check_pointed_object (p, CMSE_NONSECURE | CMSE_MPU_UNPRIV);
-+
-+ return a;
-+}
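-+/* Each cmse_TT* intrinsic and its _fptr variant should expand to the
-+   matching TT instruction variant, two occurrences of each. */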
-+/* { dg-final { scan-assembler-times "\ttt " 2 } } */
-+/* { dg-final { scan-assembler-times "ttt " 2 } } */
-+/* { dg-final { scan-assembler-times "tta " 2 } } */
-+/* { dg-final { scan-assembler-times "ttat " 2 } } */
-+/* { dg-final { scan-assembler-times "bl.cmse_check_address_range" 7 } } */
-+/* { dg-final { scan-assembler-not "cmse_check_pointed_object" } } */
-+
-+int __attribute__ ((cmse_nonsecure_entry))
-+baz (void)
-+{
-+ return cmse_nonsecure_caller ();
-+}
-+
-+typedef int __attribute__ ((cmse_nonsecure_call)) (int_nsfunc_t) (void);
-+
-+int default_callback (void)
-+{
-+ return 0;
-+}
-+
-+int_nsfunc_t * fp = (int_nsfunc_t *) default_callback;
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+qux (int_nsfunc_t * callback)
-+{
-+ fp = cmse_nsfptr_create (callback);
-+}
-+
-+int call_callback (void)
-+{
-+ if (cmse_is_nsfptr (fp))
-+ return fp ();
-+ else
-+ return default_callback ();
-+}
-+/* { dg-final { scan-assembler "baz:" } } */
-+/* { dg-final { scan-assembler "__acle_se_baz:" } } */
-+/* { dg-final { scan-assembler "qux:" } } */
-+/* { dg-final { scan-assembler "__acle_se_qux:" } } */
-+/* { dg-final { scan-assembler-not "\tcmse_nonsecure_caller" } } */
-+/* { dg-final { scan-rtl-dump "and.*reg.*const_int 1" expand } } */
-+/* { dg-final { scan-assembler "bic" } } */
-+/* { dg-final { scan-assembler "push\t\{r4, r5, r6" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq" } } */
-+/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-10.c
-@@ -0,0 +1,9 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+void
-+foo (void) {}
-+
-+/* { dg-final { scan-assembler-not "bxns" } } */
-+/* { dg-final { scan-assembler "foo:" } } */
-+/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-12.c
-@@ -0,0 +1,14 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+#include <arm_cmse.h>
-+
-+char *
-+foo (char * p)
-+{
-+ if (!cmse_is_nsfptr (p))
-+ return cmse_nsfptr_create (p);
-+}
-+
-+/* Check that both intrinsics are expanded inline rather than called. */
-+/* { dg-final { scan-assembler-not "cmse_is_nsfptr" } } */
-+/* { dg-final { scan-assembler-not "cmse_nsfptr_create" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-14.c
-@@ -0,0 +1,13 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
-+
-+int foo (void)
-+{
-+ return bar ();
-+}
-+
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+/* { dg-final { scan-assembler-not "b\[^ y\n\]*\\s+bar" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-15.c
-@@ -0,0 +1,74 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*ns_foo) (void);
-+int (*s_bar) (void);
-+int __attribute__ ((cmse_nonsecure_call)) (**ns_foo2) (void);
-+int (**s_bar2) (void);
-+
-+typedef int __attribute__ ((cmse_nonsecure_call)) ns_foo_t (void);
-+typedef int s_bar_t (void);
-+typedef int __attribute__ ((cmse_nonsecure_call)) (* ns_foo_ptr) (void);
-+typedef int (*s_bar_ptr) (void);
-+
-+int nonsecure0 (ns_foo_t * ns_foo_p)
-+{
-+ return ns_foo_p ();
-+}
-+
-+int nonsecure1 (ns_foo_t ** ns_foo_p)
-+{
-+ return (*ns_foo_p) ();
-+}
-+
-+int nonsecure2 (ns_foo_ptr ns_foo_p)
-+{
-+ return ns_foo_p ();
-+}
-+int nonsecure3 (ns_foo_ptr * ns_foo_p)
-+{
-+ return (*ns_foo_p) ();
-+}
-+
-+int secure0 (s_bar_t * s_bar_p)
-+{
-+ return s_bar_p ();
-+}
-+
-+int secure1 (s_bar_t ** s_bar_p)
-+{
-+ return (*s_bar_p) ();
-+}
-+
-+int secure2 (s_bar_ptr s_bar_p)
-+{
-+ return s_bar_p ();
-+}
-+
-+int secure3 (s_bar_ptr * s_bar_p)
-+{
-+ return (*s_bar_p) ();
-+}
-+
-+int nonsecure4 (void)
-+{
-+ return ns_foo ();
-+}
-+
-+int nonsecure5 (void)
-+{
-+ return (*ns_foo2) ();
-+}
-+
-+int secure4 (void)
-+{
-+ return s_bar ();
-+}
-+
-+int secure5 (void)
-+{
-+ return (*s_bar2) ();
-+}
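-+/* Only the six calls made through cmse_nonsecure_call pointer types
-+   above must go via __gnu_cmse_nonsecure_call. */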
-+/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 6 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c
-@@ -0,0 +1,45 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+struct span {
-+ int a, b;
-+};
-+struct span2 {
-+ float a, b, c, d;
-+};
-+
-+union test_union
-+{
-+ long long a;
-+ int b;
-+ struct span2 c;
-+} test_union;
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+foo (long long a, int b, long long c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+bar (long long a, int b, struct span c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+baz (int a, ...) {} /* { dg-error "not available to functions with variable number of arguments" } */
-+
-+struct span __attribute__ ((cmse_nonsecure_entry))
-+qux (void) { /* { dg-error "not available to functions that return value on the stack" } */
-+ struct span ret = {0, 0};
-+ return ret;
-+}
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+norf (struct span2 a) {}
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+foo2 (long long a, int b, union test_union c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) bar2 (long long a, int b, long long c); /* { dg-error "not available to functions with arguments passed on the stack" } */
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) baz2 (long long a, int b, struct span c); /* { dg-error "not available to functions with arguments passed on the stack" } */
-+
-+typedef struct span __attribute__ ((cmse_nonsecure_call)) qux2 (void); /* { dg-error "not available to functions that return value on the stack" } */
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) norf2 (int a, ...); /* { dg-error "not available to functions with variable number of arguments" } */
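All of the cmse-3.c diagnostics follow from one rule: values crossing the security boundary must travel entirely in registers, because the non-secure side can neither be handed secure stack contents nor be allowed to deposit arguments there. Under AAPCS that leaves r0-r3 for arguments and r0-r1 for a scalar return, which is why norf (a 16-byte struct exactly filling r0-r3) is accepted while the 20-byte signatures and the variadic case are rejected. A sketch of signatures that stay within the limit (register assignment per AAPCS; illustrative only):

/* OK: four words of arguments exactly fill r0-r3.  */
void __attribute__ ((cmse_nonsecure_entry)) ok1 (int a, int b, int c, int d);

/* OK: a 64-bit scalar returns in r0:r1.  */
long long __attribute__ ((cmse_nonsecure_entry)) ok2 (void);

/* Rejected, as in cmse-3.c: r0:r1 and r2 are taken, and the second
   long long would need the (secure) stack.  */
/* void __attribute__ ((cmse_nonsecure_entry)) bad (long long, int, long long); */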
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c
-@@ -0,0 +1,34 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+struct span {
-+ int a, b;
-+};
-+
-+extern int qux (void);
-+
-+void __attribute__ ((cmse_nonsecure_entry))
-+foo (void) {}
-+
-+static void __attribute__ ((cmse_nonsecure_entry))
-+bar (void) {} /* { dg-warning "has no effect on functions with static linkage" } */
-+
-+int __attribute__ ((cmse_nonsecure_entry))
-+baz (void)
-+{
-+ return qux ();
-+}
-+
-+void __attribute__ ((cmse_nonsecure_call))
-+quux (void) {} /* { dg-warning "attribute only applies to base type of a function pointer" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) norf; /* { dg-warning "attribute only applies to base type of a function pointer" } */
-+
-+/* { dg-final { scan-assembler-times "bxns" 2 } } */
-+/* { dg-final { scan-assembler "foo:" } } */
-+/* { dg-final { scan-assembler "__acle_se_foo:" } } */
-+/* { dg-final { scan-assembler-not "__acle_se_bar:" } } */
-+/* { dg-final { scan-assembler "baz:" } } */
-+/* { dg-final { scan-assembler "__acle_se_baz:" } } */
-+/* { dg-final { scan-assembler-not "__acle_se_quux:" } } */
-+/* { dg-final { scan-assembler-not "__acle_se_norf:" } } */
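cmse-4.c checks the twin-label scheme: every non-static cmse_nonsecure_entry function is emitted under both its ordinary name and an __acle_se_-prefixed alias, and returns via bxns. The twin label is what later lets the linker (with --cmse-implib) recognise entry points and build their SG veneers in .gnu.sgstubs, at which point the plain name resolves to the veneer and the __acle_se_ name to the code itself. An illustrative shape of the output, not a verbatim dump:

int __attribute__ ((cmse_nonsecure_entry))
counter (void)
{
  static int n;
  return ++n;
}

/* Expected assembly skeleton:
 *
 * counter:
 * __acle_se_counter:           @ twin label marks the secure entry point
 *      ...                     @ body; clear secret-bearing registers
 *      bxns    lr              @ return to the non-secure caller
 */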
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c
-@@ -0,0 +1,20 @@
-+/* { dg-do compile } */
-+/* { dg-skip-if "Testing exclusion of -mcmse" { arm-*-* } { "-mcmse" } { "" } } */
-+
-+
-+void __attribute__ ((cmse_nonsecure_call)) (*bar) (int); /* { dg-warning "attribute ignored without -mcmse option" } */
-+typedef void __attribute__ ((cmse_nonsecure_call)) baz (int); /* { dg-warning "attribute ignored without -mcmse option" } */
-+
-+int __attribute__ ((cmse_nonsecure_entry))
-+foo (int a, baz b)
-+{ /* { dg-warning "attribute ignored without -mcmse option" } */
-+ bar (a);
-+ b (a);
-+ return a + 1;
-+}
-+
-+/* { dg-final { scan-assembler-not "bxns" } } */
-+/* { dg-final { scan-assembler-not "blxns" } } */
-+/* { dg-final { scan-assembler-not "bl\t__gnu_cmse_nonsecure_call" } } */
-+/* { dg-final { scan-assembler "foo:" } } */
-+/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/cmse.exp
-@@ -0,0 +1,72 @@
-+# Copyright (C) 1997-2016 Free Software Foundation, Inc.
-+
-+# This program is free software; you can redistribute it and/or modify
-+# it under the terms of the GNU General Public License as published by
-+# the Free Software Foundation; either version 3 of the License, or
-+# (at your option) any later version.
-+#
-+# This program is distributed in the hope that it will be useful,
-+# but WITHOUT ANY WARRANTY; without even the implied warranty of
-+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+# GNU General Public License for more details.
-+#
-+# You should have received a copy of the GNU General Public License
-+# along with GCC; see the file COPYING3. If not see
-+# <http://www.gnu.org/licenses/>.
-+
-+# GCC testsuite for ARMv8-M Security Extensions using the `dg.exp' driver.
-+
-+# Load support procs.
-+load_lib gcc-dg.exp
-+
-+# Exit immediately if the target does not support -mcmse.
-+if ![check_effective_target_arm_cmse_ok] then {
-+ return
-+}
-+
-+# If a testcase doesn't have special options, use these.
-+global DEFAULT_CFLAGS
-+if ![info exists DEFAULT_CFLAGS] then {
-+ set DEFAULT_CFLAGS " -ansi -pedantic-errors"
-+}
-+
-+# Initialize `dg'.
-+dg-init
-+
-+set saved-dg-do-what-default ${dg-do-what-default}
-+set dg-do-what-default "assemble"
-+
-+set saved-lto_torture_options ${LTO_TORTURE_OPTIONS}
-+set LTO_TORTURE_OPTIONS ""
-+
-+# These are for both baseline and mainline.
-+gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+
-+if {[check_effective_target_arm_arch_v8m_base_ok]} then {
-+ # Baseline only
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/baseline/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+}
-+
-+if {[check_effective_target_arm_arch_v8m_main_ok]} then {
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+ # Mainline -mfloat-abi=soft
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/soft/*.c]] \
-+ "-mfloat-abi=soft" $DEFAULT_CFLAGS
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp-sp/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+ gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard-sp/*.c]] \
-+ "" $DEFAULT_CFLAGS
-+}
-+
-+set LTO_TORTURE_OPTIONS ${saved-lto_torture_options}
-+set dg-do-what-default ${saved-dg-do-what-default}
-+
-+# All done.
-+dg-finish
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c
-@@ -0,0 +1,55 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int b:5;
-+ unsigned int c:11, :0, d:8;
-+ struct { unsigned int ee:2; } e;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+extern void foo (test_st st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+ r.values.v4 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tip, 255" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "mov\tip, #255" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "mov\tip, #3" } } */
-+/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c
-@@ -0,0 +1,51 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned short b :5;
-+ unsigned char c;
-+ unsigned short d :11;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #8191" } } */
-+/* { dg-final { scan-assembler "movt\tip, 255" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #2047" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c
-@@ -0,0 +1,61 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int b : 3;
-+ unsigned int c : 14;
-+ unsigned int d : 1;
-+ struct {
-+ unsigned int ee : 2;
-+ unsigned short ff : 15;
-+ } e;
-+ unsigned char g : 1;
-+ unsigned char : 4;
-+ unsigned char h : 3;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+ r.values.v4 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tip, 1023" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "mov\tip, #3" } } */
-+/* { dg-final { scan-assembler "movt\tip, 32767" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "mov\tip, #255" } } */
-+/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c
-@@ -0,0 +1,52 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned short b :5;
-+ unsigned char c;
-+ unsigned short d :11;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+
-+/* { dg-final { scan-assembler "movw\tip, #8191" } } */
-+/* { dg-final { scan-assembler "movt\tip, 255" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #2047" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c
-@@ -0,0 +1,55 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int :0;
-+ unsigned int b :1;
-+ unsigned short :0;
-+ unsigned short c;
-+ unsigned int :0;
-+ unsigned int d :21;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "mov\tip, #255" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "mov\tip, #1" } } */
-+/* { dg-final { scan-assembler "movt\tip, 65535" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tip, 31" } } */
-+/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c
-@@ -0,0 +1,54 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ char a:3;
-+} test_st3;
-+
-+typedef struct
-+{
-+ char a:3;
-+} test_st2;
-+
-+typedef struct
-+{
-+ test_st2 st2;
-+ test_st3 st3;
-+} test_st;
-+
-+typedef union
-+{
-+ test_st st;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st;
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st);
-+
-+int
-+main (void)
-+{
-+ read_st r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+
-+ f (r.st);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #1799" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c
-@@ -0,0 +1,94 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned short a :11;
-+} test_st_4;
-+
-+typedef union
-+{
-+ char a;
-+ test_st_4 st4;
-+}test_un_2;
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int :0;
-+ unsigned int b :1;
-+ unsigned short :0;
-+ unsigned short c;
-+ unsigned int :0;
-+ unsigned int d :21;
-+} test_st_3;
-+
-+typedef struct
-+{
-+ unsigned char a :3;
-+ unsigned int b :13;
-+ test_un_2 un2;
-+} test_st_2;
-+
-+typedef union
-+{
-+ test_st_2 st2;
-+ test_st_3 st3;
-+}test_un_1;
-+
-+typedef struct
-+{
-+ unsigned char a :2;
-+ unsigned char :0;
-+ unsigned short b :5;
-+ unsigned char :0;
-+ unsigned char c :4;
-+ test_un_1 un1;
-+} test_st_1;
-+
-+typedef union
-+{
-+ test_st_1 st1;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_st_1;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1);
-+
-+int
-+main (void)
-+{
-+ read_st_1 r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+ r.values.v4 = 0xFFFFFFFF;
-+
-+ f (r.st1);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #7939" } } */
-+/* { dg-final { scan-assembler "movt\tip, 15" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tip, 2047" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "mov\tip, #1" } } */
-+/* { dg-final { scan-assembler "movt\tip, 65535" } } */
-+/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tip, 31" } } */
-+/* { dg-final { scan-assembler "and\tr3, r3, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c
-@@ -0,0 +1,43 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
-+
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (3.0f, 2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c
-@@ -0,0 +1,45 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
-+
-+extern float bar (void);
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+foo (void)
-+{
-+ return bar ();
-+}
-+/* { dg-final { scan-assembler "mov\tr0, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr1, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr2, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr3, lr" } } */
-+/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
-+/* { dg-final { scan-assembler "push\t{r4}" } } */
-+/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
-+/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
-+/* { dg-final { scan-assembler "and\tip, r4" } } */
-+/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
-+/* { dg-final { scan-assembler "pop\t{r4}" } } */
-+/* { dg-final { scan-assembler "mov\tip, lr" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c
-@@ -0,0 +1,42 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
-+
-+int
-+foo (int a)
-+{
-+ return bar () + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts0, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c
-@@ -0,0 +1,41 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts1, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c
-@@ -0,0 +1,38 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
-+
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (3.0f, 2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.64\td1, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */
-+/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c
-@@ -0,0 +1,38 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
-+
-+extern float bar (void);
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+foo (void)
-+{
-+ return bar ();
-+}
-+/* { dg-final { scan-assembler "mov\tr0, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr1, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr2, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr3, lr" } } */
-+/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
-+/* { dg-final { scan-assembler "push\t{r4}" } } */
-+/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
-+/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
-+/* { dg-final { scan-assembler "and\tip, r4" } } */
-+/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
-+/* { dg-final { scan-assembler "pop\t{r4}" } } */
-+/* { dg-final { scan-assembler "mov\tip, lr" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c
-@@ -0,0 +1,34 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
-+
-+int
-+foo (int a)
-+{
-+ return bar () + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td0, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c
-@@ -0,0 +1,33 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */
-+/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c
-@@ -0,0 +1,27 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=soft" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (1.0f, 2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vmov" } } */
-+/* { dg-final { scan-assembler-not "vmsr" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c
-@@ -0,0 +1,24 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=soft" } */
-+
-+extern float bar (void);
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+foo (void)
-+{
-+ return bar ();
-+}
-+
-+/* { dg-final { scan-assembler "mov\tr1, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr2, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr3, lr" } } */
-+/* { dg-final { scan-assembler "mov\tip, lr" } } */
-+/* { dg-final { scan-assembler-not "vmov" } } */
-+/* { dg-final { scan-assembler-not "vmsr" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
-+/* { dg-final { scan-assembler "bxns" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c
-@@ -0,0 +1,27 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=soft" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
-+
-+int
-+foo (int a)
-+{
-+ return bar () + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vmov" } } */
-+/* { dg-final { scan-assembler-not "vmsr" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c
-@@ -0,0 +1,26 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=soft" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler-not "vmov" } } */
-+/* { dg-final { scan-assembler-not "vmsr" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c
-@@ -0,0 +1,46 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */
-+
-+extern float bar (void);
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+foo (void)
-+{
-+ return bar ();
-+}
-+/* { dg-final { scan-assembler "__acle_se_foo:" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr1, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr2, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr3, lr" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts0, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
-+/* { dg-final { scan-assembler "push\t{r4}" } } */
-+/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
-+/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
-+/* { dg-final { scan-assembler "and\tip, r4" } } */
-+/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
-+/* { dg-final { scan-assembler "pop\t{r4}" } } */
-+/* { dg-final { scan-assembler "mov\tip, lr" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c
-@@ -0,0 +1,26 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
-+
-+int
-+foo (int a)
-+{
-+ return bar () + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c
-@@ -0,0 +1,25 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c
-@@ -0,0 +1,25 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (1.0f, 2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "\n\tmov\tr1, r4" } } */
-+/* { dg-final { scan-assembler-not "\n\tmov\tr2, r4\n\tmov\tr3, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c
-@@ -0,0 +1,38 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
-+
-+extern float bar (void);
-+
-+float __attribute__ ((cmse_nonsecure_entry))
-+foo (void)
-+{
-+ return bar ();
-+}
-+/* { dg-final { scan-assembler "__acle_se_foo:" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr1, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr2, lr" } } */
-+/* { dg-final { scan-assembler "mov\tr3, lr" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td0, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */
-+/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */
-+/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */
-+/* { dg-final { scan-assembler "push\t{r4}" } } */
-+/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */
-+/* { dg-final { scan-assembler "movw\tr4, #65376" } } */
-+/* { dg-final { scan-assembler "movt\tr4, #4095" } } */
-+/* { dg-final { scan-assembler "and\tip, r4" } } */
-+/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */
-+/* { dg-final { scan-assembler "pop\t{r4}" } } */
-+/* { dg-final { scan-assembler "mov\tip, lr" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c
-@@ -0,0 +1,26 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (void);
-+
-+int
-+foo (int a)
-+{
-+ return bar () + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c
-@@ -0,0 +1,25 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v8m_main_ok } */
-+/* { dg-add-options arm_arch_v8m_main } */
-+/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */
-+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
-+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */
-+
-+int __attribute__ ((cmse_nonsecure_call)) (*bar) (double);
-+
-+int
-+foo (int a)
-+{
-+ return bar (2.0) + a + 1;
-+}
-+
-+/* Checks for saving and clearing prior to function call. */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
-+/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+
-+/* Now we check that we use the correct intrinsic to call. */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c
-@@ -0,0 +1,69 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a :2;
-+ unsigned char :0;
-+ unsigned short b :5;
-+ unsigned char :0;
-+ unsigned short c :3;
-+ unsigned char :0;
-+ unsigned int d :9;
-+} test_st_1;
-+
-+typedef struct
-+{
-+ unsigned short a :7;
-+ unsigned char :0;
-+ unsigned char b :1;
-+ unsigned char :0;
-+ unsigned short c :6;
-+} test_st_2;
-+
-+typedef union
-+{
-+ test_st_1 st_1;
-+ test_st_2 st_2;
-+}test_un;
-+
-+typedef union
-+{
-+ test_un un;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_un;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
-+
-+int
-+main (void)
-+{
-+ read_un r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+
-+ f (r.un);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #8063" } } */
-+/* { dg-final { scan-assembler "movt\tip, 63" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #511" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr2, r4" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
-+
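For a union argument any member may be live, so the kept bits are the OR of every member's field masks. In union-1.c the first word collects 0x00071F03 from test_st_1 (a:2, b:5 at bit 8, c:3 at bit 16) and 0x003F017F from test_st_2 (a:7, b:1 at bit 8, c:6 at bit 16), giving 0x003F1F7F, i.e. movw ip, #8063; movt ip, 63; d:9 of st_1 starts a fresh 32-bit container and yields the 511 mask applied to r1. Restated (C11):

#include <assert.h>

#define ST1_WORD0 (0x3u | 0x1Fu << 8 | 0x7u << 16)      /* a:2 b:5 c:3 */
#define ST2_WORD0 (0x7Fu | 0x1u << 8 | 0x3Fu << 16)     /* a:7 b:1 c:6 */

static_assert ((ST1_WORD0 | ST2_WORD0) == 0x003F1F7Fu,
               "word 0: movw ip, #8063 plus movt ip, 63");
static_assert (0x1FFu == 511u, "word 1: d:9 alone in its container");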
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c
-@@ -0,0 +1,84 @@
-+/* { dg-do compile } */
-+/* { dg-options "-mcmse" } */
-+
-+typedef struct
-+{
-+ unsigned char a :2;
-+ unsigned char :0;
-+ unsigned short b :5;
-+ unsigned char :0;
-+ unsigned short c :3;
-+ unsigned char :0;
-+ unsigned int d :9;
-+} test_st_1;
-+
-+typedef struct
-+{
-+ unsigned short a :7;
-+ unsigned char :0;
-+ unsigned char b :1;
-+ unsigned char :0;
-+ unsigned short c :6;
-+} test_st_2;
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned int :0;
-+ unsigned int b :1;
-+ unsigned short :0;
-+ unsigned short c;
-+ unsigned int :0;
-+ unsigned int d :21;
-+} test_st_3;
-+
-+typedef union
-+{
-+ test_st_1 st_1;
-+ test_st_2 st_2;
-+ test_st_3 st_3;
-+}test_un;
-+
-+typedef union
-+{
-+ test_un un;
-+ struct
-+ {
-+ unsigned int v1;
-+ unsigned int v2;
-+ unsigned int v3;
-+ unsigned int v4;
-+ }values;
-+} read_un;
-+
-+
-+typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un);
-+
-+int
-+main (void)
-+{
-+ read_un r;
-+ foo_ns f;
-+
-+ f = (foo_ns) 0x200000;
-+ r.values.v1 = 0xFFFFFFFF;
-+ r.values.v2 = 0xFFFFFFFF;
-+ r.values.v3 = 0xFFFFFFFF;
-+
-+ f (r.un);
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movw\tip, #8191" } } */
-+/* { dg-final { scan-assembler "movt\tip, 63" } } */
-+/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #511" } } */
-+/* { dg-final { scan-assembler "movt\tip, 65535" } } */
-+/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */
-+/* { dg-final { scan-assembler "movw\tip, #65535" } } */
-+/* { dg-final { scan-assembler "movt\tip, 31" } } */
-+/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */
-+/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */
-+/* { dg-final { scan-assembler "mov\tr3, r4" } } */
-+/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/cmse/struct-1.c
-@@ -0,0 +1,33 @@
-+/* { dg-do run } */
-+/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */
-+
-+typedef struct
-+{
-+ unsigned char a;
-+ unsigned short b;
-+} test_st;
-+
-+test_st __attribute__ ((cmse_nonsecure_entry)) foo (void)
-+{
-+ test_st t;
-+ t.a = 255u;
-+ t.b = 32767u;
-+ return t;
-+}
-+
-+int
-+main (void)
-+{
-+ test_st t;
-+ t = foo ();
-+ if (t.a != 255u || t.b != 32767u)
-+ __builtin_abort ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler "movs\tr1, #255" } } */
-+/* { dg-final { scan-assembler "movt\tr1, 65535" } } */
-+/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */
-+/* { dg-final { scan-assembler "bxns" } } */
-+
-+
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/data-rel-1.c
-@@ -0,0 +1,12 @@
-+/* { dg-options "-fPIC -mno-pic-data-is-text-relative" } */
-+/* { dg-final { scan-assembler-not "j-\\(.LPIC" } } */
-+/* { dg-final { scan-assembler-not "_GLOBAL_OFFSET_TABLE_-\\(.LPIC" } } */
-+/* { dg-final { scan-assembler "j\\(GOT\\)" } } */
-+/* { dg-final { scan-assembler "(ldr|mov)\tr\[0-9\]+, \\\[?r9" } } */
-+
-+static int j;
-+
-+int *Foo ()
-+{
-+ return &j;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/data-rel-2.c
-@@ -0,0 +1,11 @@
-+/* { dg-options "-fPIC -mno-pic-data-is-text-relative -mno-single-pic-base" } */
-+/* { dg-final { scan-assembler-not "j-\\(.LPIC" } } */
-+/* { dg-final { scan-assembler "_GLOBAL_OFFSET_TABLE_-\\(.LPIC" } } */
-+/* { dg-final { scan-assembler "j\\(GOT\\)" } } */
-+
-+static int j;
-+
-+int *Foo ()
-+{
-+ return &j;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/data-rel-3.c
-@@ -0,0 +1,11 @@
-+/* { dg-options "-fPIC -mpic-data-is-text-relative" } */
-+/* { dg-final { scan-assembler "j-\\(.LPIC" } } */
-+/* { dg-final { scan-assembler-not "_GLOBAL_OFFSET_TABLE_-\\(.LPIC" } } */
-+/* { dg-final { scan-assembler-not "j\\(GOT\\)" } } */
-+
-+static int j;
-+
-+int *Foo ()
-+{
-+ return &j;
-+}
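The three data-rel tests separate the two knobs behind position-independent data access. With -mpic-data-is-text-relative (normally the default under -fPIC) a local static is addressed PC-relatively through an .LPIC anchor; with -mno-pic-data-is-text-relative it must go through the GOT, and unless -mno-single-pic-base is also given, the PIC base is expected preloaded in r9 rather than recomputed from _GLOBAL_OFFSET_TABLE_. An illustrative summary of the addressing each test expects:

static int j;

int *
addr_of_j (void)
{
  return &j;
}

/* Addressing of `j' under -fPIC (from the scan patterns above):
 *
 *   -mpic-data-is-text-relative      ldr rX, .LPICn      @ j - anchor
 *   -mno-pic-data-is-text-relative   j(GOT) slot, base held in r9
 *   ... plus -mno-single-pic-base    base rebuilt from
 *                                    _GLOBAL_OFFSET_TABLE_ - .LPICn
 */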
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-1.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_fp16_ieee } */
-+
-+/* Test __fp16 arguments and return value in registers (hard-float). */
-+
-+void
-+swap (__fp16, __fp16);
-+
-+__fp16
-+F (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ swap (b, a);
-+ return c;
-+}
-+
-+/* { dg-final { scan-assembler {vmov(\.f16)?\tr[0-9]+, s[0-9]+} } } */
-+/* { dg-final { scan-assembler {vmov(\.f32)?\ts1, s0} } } */
-+/* { dg-final { scan-assembler {vmov(\.f16)?\ts0, r[0-9]+} } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_ok } */
-+/* { dg-options "-mfloat-abi=softfp -O2" } */
-+/* { dg-add-options arm_fp16_ieee } */
-+/* { dg-skip-if "incompatible float-abi" { arm*-*-* } { "-mfloat-abi=hard" } } */
-+
-+/* Test __fp16 arguments and return value in registers (softfp). */
-+
-+void
-+swap (__fp16, __fp16);
-+
-+__fp16
-+F (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ swap (b, a);
-+ return c;
-+}
-+
-+/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } } */
-+/* { dg-final { scan-assembler-times {mov\tr1, r0} 1 } } */
-+/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-3.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_hard_vfp_ok } */
-+/* { dg-require-effective-target arm_fp16_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_fp16_alternative } */
-+
-+/* Test __fp16 arguments and return value in registers (hard-float). */
-+
-+void
-+swap (__fp16, __fp16);
-+
-+__fp16
-+F (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ swap (b, a);
-+ return c;
-+}
-+
-+/* { dg-final { scan-assembler-times {vmov\tr[0-9]+, s[0-2]} 2 } } */
-+/* { dg-final { scan-assembler-times {vmov.f32\ts1, s0} 1 } } */
-+/* { dg-final { scan-assembler-times {vmov\ts0, r[0-9]+} 2 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c
-@@ -0,0 +1,21 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_ok } */
-+/* { dg-options "-mfloat-abi=softfp -O2" } */
-+/* { dg-add-options arm_fp16_alternative } */
-+/* { dg-skip-if "incompatible float-abi" { arm*-*-* } { "-mfloat-abi=hard" } } */
-+
-+/* Test __fp16 arguments and return value in registers (softfp). */
-+
-+void
-+swap (__fp16, __fp16);
-+
-+__fp16
-+F (__fp16 a, __fp16 b, __fp16 c)
-+{
-+ swap (b, a);
-+ return c;
-+}
-+
-+/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } } */
-+/* { dg-final { scan-assembler-times {mov\tr1, r0} 1 } } */
-+/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } } */
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-1.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-1.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- __fp16 xx = 0.0;
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-10.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-10.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative -pedantic -std=gnu99" } */
-
- #include <math.h>
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-11.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-11.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative -pedantic -std=gnu99" } */
-
- #include <math.h>
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-12.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-12.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- float xx __attribute__((mode(HF))) = 0.0;
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-2.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-2.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-3.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-3.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-4.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-4.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-5.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-5.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-6.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-6.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* This number is the maximum value representable in the alternative
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-7.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-7.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative -pedantic" } */
-
- /* This number overflows the range of the alternative encoding. Since this
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-8.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-8.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-9.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-alt-9.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- /* Encoding taken from: http://en.wikipedia.org/wiki/Half_precision */
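
Each one-line hunk in this run adds the same guard: the alternative-format
tests now require the new arm_fp16_alternative_ok effective target, so
configurations where -mfp16-format=alternative is unavailable report the
tests as unsupported rather than failing; the fp16-compile-none-* hunks
just below apply the same pattern with arm_fp16_none_ok. The guarded form
the hunks converge on looks like this (directive names are from the patch
itself; the body is illustrative):

    /* { dg-do compile } */
    /* { dg-require-effective-target arm_fp16_alternative_ok } */
    /* { dg-options "-mfp16-format=alternative" } */

    __fp16 one = 1.0;
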
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-1.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-1.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_none_ok } */
- /* { dg-options "-mfp16-format=none" } */
-
- /* __fp16 type name is not recognized unless you explicitly enable it
---- a/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-2.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-compile-none-2.c
-@@ -1,4 +1,5 @@
- /* { dg-do compile } */
-+/* { dg-require-effective-target arm_fp16_none_ok } */
- /* { dg-options "-mfp16-format=none" } */
-
- /* mode(HF) attributes are not recognized unless you explicitly enable
---- a/src/gcc/testsuite/gcc.target/arm/fp16-param-1.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-param-1.c
-@@ -1,10 +1,14 @@
- /* { dg-do compile } */
- /* { dg-options "-mfp16-format=ieee" } */
-
--/* Functions cannot have parameters of type __fp16. */
--extern void f (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */
--extern void (*pf) (__fp16); /* { dg-error "parameters cannot have __fp16 type" } */
-+/* Test that the ACLE macro is defined. */
-+#if __ARM_FP16_ARGS != 1
-+#error Unexpected value for __ARM_FP16_ARGS
-+#endif
-+
-+/* Test that __fp16 is supported as a parameter type. */
-+extern void f (__fp16);
-+extern void (*pf) (__fp16);
-
--/* These should be OK. */
- extern void g (__fp16 *);
- extern void (*pg) (__fp16 *);
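
With this change the test asserts the ACLE feature macro and the new
behaviour together: __ARM_FP16_ARGS is 1 exactly when __fp16 may appear
in parameter lists and return types. Portable code can key off the same
macro; a hedged sketch (the helper name is hypothetical):

    #if defined (__ARM_FP16_ARGS) && __ARM_FP16_ARGS == 1
    extern __fp16 half_sum (__fp16 a, __fp16 b);
    #else
    extern float half_sum (float a, float b);  /* fall back to float */
    #endif
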
---- a/src/gcc/testsuite/gcc.target/arm/fp16-return-1.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-return-1.c
-@@ -1,10 +1,9 @@
- /* { dg-do compile } */
- /* { dg-options "-mfp16-format=ieee" } */
-
--/* Functions cannot return type __fp16. */
--extern __fp16 f (void); /* { dg-error "cannot return __fp16" } */
--extern __fp16 (*pf) (void); /* { dg-error "cannot return __fp16" } */
-+/* Test that __fp16 is supported as a return type. */
-+extern __fp16 f (void);
-+extern __fp16 (*pf) (void);
-
--/* These should be OK. */
- extern __fp16 *g (void);
- extern __fp16 *(*pg) (void);
---- a/src/gcc/testsuite/gcc.target/arm/fp16-rounding-alt-1.c
-+++ b/src/gcc/testsuite/gcc.target/arm/fp16-rounding-alt-1.c
-@@ -3,6 +3,7 @@
- from double to __fp16. */
-
- /* { dg-do run } */
-+/* { dg-require-effective-target arm_fp16_alternative_ok } */
- /* { dg-options "-mfp16-format=alternative" } */
-
- #include <stdlib.h>
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/movdi_movw.c
-@@ -0,0 +1,12 @@
-+/* { dg-do compile { target { arm_thumb2_ok || arm_thumb1_movt_ok } } } */
-+/* { dg-options "-O2" } */
-+
-+long long
-+movdi (int a)
-+{
-+ return 0xF0F0;
-+}
-+
-+/* Accept r1 because big endian targets put the low bits in the highest
-+ numbered register of a pair. */
-+/* { dg-final { scan-assembler-times "movw\tr\[01\], #61680" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/movhi_movw.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile { target { arm_thumb2_ok || arm_thumb1_movt_ok } } } */
-+/* { dg-options "-O2" } */
-+
-+short
-+movhi (void)
-+{
-+ return (short) 0x7070;
-+}
-+
-+/* { dg-final { scan-assembler-times "movw\tr0, #28784" 1 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/movsi_movw.c
-@@ -0,0 +1,10 @@
-+/* { dg-do compile { target { arm_thumb2_ok || arm_thumb1_movt_ok } } } */
-+/* { dg-options "-O2" } */
-+
-+int
-+movsi (void)
-+{
-+ return 0xF0F0;
-+}
-+
-+/* { dg-final { scan-assembler-times "movw\tr0, #61680" 1 } } */
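
All three mov*_movw tests rely on the same property: the constant fits
the 16-bit immediate field of movw, so a single instruction suffices
(0xF0F0 is 61680 and 0x7070 is 28784, matching the scan strings). For
contrast, a sketch of a constant that does not fit and would typically
be materialised with a movw/movt pair on these targets (the function
name is illustrative):

    unsigned int
    imm32 (void)
    {
      /* Low half 0x5678 (22136) via movw, high half 0x1234 (4660)
         via movt -- two instructions instead of one.  */
      return 0x12345678u;
    }
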
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
-@@ -0,0 +1,19 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O3" } */
-+/* { dg-add-options arm_neon } */
-+
-+
-+
-+int
-+t6 (int len, void * dummy, short * __restrict x)
-+{
-+ len = len & ~31;
-+ int result = 0;
-+ __asm volatile ("");
-+ for (int i = 0; i < len; i++)
-+ result += x[i];
-+ return result;
-+}
-+
-+/* { dg-final { scan-assembler "vaddw\.s16" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O3" } */
-+/* { dg-add-options arm_neon } */
-+
-+
-+int
-+t6 (int len, void * dummy, int * __restrict x)
-+{
-+ len = len & ~31;
-+ long long result = 0;
-+ __asm volatile ("");
-+ for (int i = 0; i < len; i++)
-+ result += x[i];
-+ return result;
-+}
-+
-+/* { dg-final { scan-assembler "vaddw\.s32" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O3" } */
-+/* { dg-add-options arm_neon } */
-+
-+
-+int
-+t6 (int len, void * dummy, unsigned short * __restrict x)
-+{
-+ len = len & ~31;
-+ unsigned int result = 0;
-+ __asm volatile ("");
-+ for (int i = 0; i < len; i++)
-+ result += x[i];
-+ return result;
-+}
-+
-+/* { dg-final { scan-assembler "vaddw.u16" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
-@@ -0,0 +1,18 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O3" } */
-+/* { dg-add-options arm_neon } */
-+
-+
-+int
-+t6 (int len, void * dummy, unsigned int * __restrict x)
-+{
-+ len = len & ~31;
-+ unsigned long long result = 0;
-+ __asm volatile ("");
-+ for (int i = 0; i < len; i++)
-+ result += x[i];
-+ return result;
-+}
-+
-+/* { dg-final { scan-assembler "vaddw\.u32" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
-@@ -0,0 +1,19 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O3" } */
-+/* { dg-add-options arm_neon } */
-+
-+
-+
-+int
-+t6 (int len, void * dummy, char * __restrict x)
-+{
-+ len = len & ~31;
-+ unsigned short result = 0;
-+ __asm volatile ("");
-+ for (int i = 0; i < len; i++)
-+ result += x[i];
-+ return result;
-+}
-+
-+/* { dg-final { scan-assembler "vaddw\.u8" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/neon.exp
-+++ b/src//dev/null
-@@ -1,35 +0,0 @@
--# Copyright (C) 1997-2016 Free Software Foundation, Inc.
--
--# This program is free software; you can redistribute it and/or modify
--# it under the terms of the GNU General Public License as published by
--# the Free Software Foundation; either version 3 of the License, or
--# (at your option) any later version.
--#
--# This program is distributed in the hope that it will be useful,
--# but WITHOUT ANY WARRANTY; without even the implied warranty of
--# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
--# GNU General Public License for more details.
--#
--# You should have received a copy of the GNU General Public License
--# along with GCC; see the file COPYING3. If not see
--# <http://www.gnu.org/licenses/>.
--
--# GCC testsuite that uses the `dg.exp' driver.
--
--# Exit immediately if this isn't an ARM target.
--if ![istarget arm*-*-*] then {
-- return
--}
--
--# Load support procs.
--load_lib gcc-dg.exp
--
--# Initialize `dg'.
--dg-init
--
--# Main loop.
--dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cCS\]]] \
-- "" ""
--
--# All done.
--dg-finish
---- a/src/gcc/testsuite/gcc.target/arm/neon/polytypes.c
-+++ b/src//dev/null
-@@ -1,48 +0,0 @@
--/* Check that NEON polynomial vector types are suitably incompatible with
-- integer vector types of the same layout. */
--
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-add-options arm_neon } */
--
--#include <arm_neon.h>
--
--void s64_8 (int8x8_t a) {}
--void u64_8 (uint8x8_t a) {}
--void p64_8 (poly8x8_t a) {}
--void s64_16 (int16x4_t a) {}
--void u64_16 (uint16x4_t a) {}
--void p64_16 (poly16x4_t a) {}
--
--void s128_8 (int8x16_t a) {}
--void u128_8 (uint8x16_t a) {}
--void p128_8 (poly8x16_t a) {}
--void s128_16 (int16x8_t a) {}
--void u128_16 (uint16x8_t a) {}
--void p128_16 (poly16x8_t a) {}
--
--void foo ()
--{
-- poly8x8_t v64_8;
-- poly16x4_t v64_16;
-- poly8x16_t v128_8;
-- poly16x8_t v128_16;
--
-- s64_8 (v64_8); /* { dg-message "use -flax-vector-conversions" } */
-- /* { dg-error "incompatible type for argument 1 of 's64_8'" "" { target *-*-* } 31 } */
-- u64_8 (v64_8); /* { dg-error "incompatible type for argument 1 of 'u64_8'" } */
-- p64_8 (v64_8);
--
-- s64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 's64_16'" } */
-- u64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 'u64_16'" } */
-- p64_16 (v64_16);
--
-- s128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 's128_8'" } */
-- u128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 'u128_8'" } */
-- p128_8 (v128_8);
--
-- s128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 's128_16'" } */
-- u128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 'u128_16'" } */
-- p128_16 (v128_16);
--}
--/* { dg-message "note: expected '\[^'\n\]*' but argument is of type '\[^'\n\]*'" "note: expected" { target *-*-* } 0 } */
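
The deleted polytypes.c enforced that NEON polynomial vectors are
type-distinct from integer vectors of the same layout, so converting
between them needs an explicit reinterpret. A minimal illustration
(the function name is hypothetical):

    #include <arm_neon.h>

    /* poly8x8_t and int8x8_t share a layout but not a type, so the
       bits must be moved with vreinterpret rather than implicitly.  */
    int8x8_t
    poly_bits (poly8x8_t p)
    {
      return vreinterpret_s8_p8 (p);
    }
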
---- a/src/gcc/testsuite/gcc.target/arm/neon/pr51534.c
-+++ b/src//dev/null
-@@ -1,83 +0,0 @@
--/* Test the vector comparison intrinsics when comparing to immediate zero.
-- */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -mfloat-abi=hard -O3" } */
--/* { dg-add-options arm_neon } */
--
--#include <arm_neon.h>
--
--#define GEN_TEST(T, D, C, R) \
-- R test_##C##_##T (T a) { return C (a, D (0)); }
--
--#define GEN_DOUBLE_TESTS(S, T, C) \
-- GEN_TEST (T, vdup_n_s##S, C##_s##S, u##T) \
-- GEN_TEST (u##T, vdup_n_u##S, C##_u##S, u##T)
--
--#define GEN_QUAD_TESTS(S, T, C) \
-- GEN_TEST (T, vdupq_n_s##S, C##q_s##S, u##T) \
-- GEN_TEST (u##T, vdupq_n_u##S, C##q_u##S, u##T)
--
--#define GEN_COND_TESTS(C) \
-- GEN_DOUBLE_TESTS (8, int8x8_t, C) \
-- GEN_DOUBLE_TESTS (16, int16x4_t, C) \
-- GEN_DOUBLE_TESTS (32, int32x2_t, C) \
-- GEN_QUAD_TESTS (8, int8x16_t, C) \
-- GEN_QUAD_TESTS (16, int16x8_t, C) \
-- GEN_QUAD_TESTS (32, int32x4_t, C)
--
--GEN_COND_TESTS(vcgt)
--GEN_COND_TESTS(vcge)
--GEN_COND_TESTS(vclt)
--GEN_COND_TESTS(vcle)
--GEN_COND_TESTS(vceq)
--
--/* Scan for expected outputs. */
--/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
--/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
--/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */
--/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */
--/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */
--/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */
--/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */
--/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */
--
--/* And ensure we don't have unexpected output too. */
--/* { dg-final { scan-assembler-not "vc\[gl\]\[te\]\.u\[0-9\]+\[ \]+\[qQdD\]\[0-9\]+, \[qQdD\]\[0-9\]+, #0" } } */
--
--/* Tidy up. */
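
The deleted pr51534.c checked that signed vector compares against a
zero vector collapse to the immediate-zero instruction forms (vcgt.s8
..., #0 and friends) while unsigned compares keep the two-register
form. A minimal reproduction of the signed case (the function name is
hypothetical):

    #include <arm_neon.h>

    /* Expected to assemble to vcgt.s8 dN, dM, #0 rather than a
       two-register compare.  */
    uint8x8_t
    gtz (int8x8_t a)
    {
      return vcgt_s8 (a, vdup_n_s8 (0));
    }
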
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRaddhns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRaddhns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int8x8_t = vraddhn_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vraddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRaddhns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRaddhns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int16x4_t = vraddhn_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vraddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRaddhns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRaddhns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int32x2_t = vraddhn_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vraddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhnu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRaddhnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRaddhnu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint8x8_t = vraddhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vraddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhnu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRaddhnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRaddhnu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint16x4_t = vraddhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vraddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRaddhnu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRaddhnu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRaddhnu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint32x2_t = vraddhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vraddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vrhaddq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vrhaddq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vrhaddq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vrhaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vrhaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vrhaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhadds16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhadds16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhadds16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vrhadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhadds32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhadds32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhadds32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vrhadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhadds8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhadds8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhadds8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vrhadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vrhadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vrhadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRhaddu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRhaddu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRhaddu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vrhadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrhadd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vrshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vrshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vrshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vrshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vrshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vrshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_uint64x2_t = vrshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vrshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshls16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vrshl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshls32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vrshl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshls64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshls64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vrshl_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshls8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vrshl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vrshl_u16 (arg0_uint16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vrshl_u32 (arg0_uint32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_uint64x1_t = vrshl_u64 (arg0_uint64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshlu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRshlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshlu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vrshl_u8 (arg0_uint8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vrshrq_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vrshrq_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int64x2_t = vrshrq_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vrshrq_n_s8 (arg0_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vrshrq_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vrshrq_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64x2_t = vrshrq_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrQ_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vrshrq_n_u8 (arg0_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vrshr_n_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vrshr_n_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64x1_t = vrshr_n_s64 (arg0_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vrshr_n_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vrshr_n_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vrshr_n_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64x1_t = vrshr_n_u64 (arg0_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshr_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshr_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshr_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vrshr_n_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshr\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrn_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrn_ns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x8_t = vrshrn_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrn_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrn_ns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x4_t = vrshrn_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrn_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrn_ns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x2_t = vrshrn_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrn_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrn_nu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x8_t = vrshrn_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrn_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrn_nu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x4_t = vrshrn_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRshrn_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vRshrn_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRshrn_nu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x2_t = vrshrn_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vrsraq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vrsraq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vrsraq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vrsraq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vrsraq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vrsraq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vrsraq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsraQ_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsraQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsraQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vrsraq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vrsra_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vrsra_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vrsra_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vrsra_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vrsra_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vrsra_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vrsra_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsra_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsra_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsra_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vrsra_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vrsra\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsubhns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsubhns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int8x8_t = vrsubhn_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsubhns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsubhns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int16x4_t = vrsubhn_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsubhns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsubhns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int32x2_t = vrsubhn_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vrsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhnu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsubhnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsubhnu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint8x8_t = vrsubhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhnu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsubhnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsubhnu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint16x4_t = vrsubhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vRsubhnu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vRsubhnu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vRsubhnu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint32x2_t = vrsubhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vrsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQs16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabaQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabaQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16x8_t arg2_int16x8_t;
--
-- out_int16x8_t = vabaq_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQs32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabaQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabaQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32x4_t arg2_int32x4_t;
--
-- out_int32x4_t = vabaq_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQs8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabaQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabaQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
-- int8x16_t arg2_int8x16_t;
--
-- out_int8x16_t = vabaq_s8 (arg0_int8x16_t, arg1_int8x16_t, arg2_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabaQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabaQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16x8_t arg2_uint16x8_t;
--
-- out_uint16x8_t = vabaq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabaQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabaQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32x4_t arg2_uint32x4_t;
--
-- out_uint32x4_t = vabaq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabaQu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabaQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabaQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
-- uint8x16_t arg2_uint8x16_t;
--
-- out_uint8x16_t = vabaq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabals16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabals16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabals16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vabal_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabals32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabals32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabals32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vabal_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabals8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabals8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabals8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int16x8_t = vabal_s8 (arg0_int16x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabal\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabalu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabalu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabalu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint32x4_t = vabal_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabalu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabalu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabalu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint64x2_t = vabal_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabalu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabalu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabalu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint16x8_t = vabal_u8 (arg0_uint16x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabal\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabas16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabas16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabas16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x4_t = vaba_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabas32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabas32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabas32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x2_t = vaba_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabas8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabas8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabas8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vaba_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabau16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabau16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabau16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x4_t = vaba_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabau32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabau32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabau32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x2_t = vaba_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabau8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vabau8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabau8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vaba_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vaba\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vabdq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vabdq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vabdq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vabdq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vabdq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vabdq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vabdq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vabd_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vabdl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabdl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vabdl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabdl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdls8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x8_t = vabdl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabdl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdlu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdlu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vabdl_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabdl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdlu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdlu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vabdl_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabdl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdlu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdlu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x8_t = vabdl_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabdl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabds16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabds16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabds16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vabd_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabds32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabds32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabds32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vabd_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabds8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabds8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabds8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vabd_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vabd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vabd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabdu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vabdu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabdu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vabd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabsQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabsQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vabsq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabsQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabsQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vabsq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabsQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabsQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vabsq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabsQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabsQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabsQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vabsq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabsf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabsf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabsf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vabs_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabss16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabss16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabss16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vabs_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabss32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabss32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabss32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vabs_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vabss8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vabss8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vabss8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vabs_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vabs\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vaddq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vaddq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vaddq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vaddq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vaddq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vaddq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vadd_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddhns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddhns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int8x8_t = vaddhn_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vaddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddhns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddhns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int16x4_t = vaddhn_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vaddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddhns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddhns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int32x2_t = vaddhn_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vaddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhnu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddhnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddhnu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint8x8_t = vaddhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vaddhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhnu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddhnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddhnu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint16x4_t = vaddhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vaddhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddhnu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddhnu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddhnu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint32x2_t = vaddhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vaddhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vaddl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vaddl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vaddl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vaddl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddls8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x8_t = vaddl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vaddl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddlu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddlu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vaddl_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vaddl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddlu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddlu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vaddl_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vaddl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddlu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddlu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x8_t = vaddl_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vaddl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vadds16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vadds16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vadds16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vadds32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vadds32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vadds32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vadds64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vadds64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vadds64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vadd_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vadds8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vadds8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vadds8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vaddu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vadd_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddws16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddws16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddws16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vaddw_s16 (arg0_int32x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vaddw\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddws32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddws32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddws32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vaddw_s32 (arg0_int64x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vaddw\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddws8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddws8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddws8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x8_t = vaddw_s8 (arg0_int16x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vaddw\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddwu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddwu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddwu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vaddw_u16 (arg0_uint32x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vaddw\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddwu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddwu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddwu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vaddw_u32 (arg0_uint64x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vaddw\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vaddwu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vaddwu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vaddwu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x8_t = vaddw_u8 (arg0_uint16x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vaddw\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vandq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vandq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vandq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vandq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vandq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vandq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vandq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vandq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vands16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vands16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vands16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vand_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vands32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vands32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vands32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vand_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vands64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vands64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vands64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vand_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vands8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vands8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vands8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vand_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vand_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vand_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vandu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vand_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vandu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vandu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vandu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vand_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vand\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int16x8_t out_int16x8_t;
--int16x8_t arg0_int16x8_t;
--int16x8_t arg1_int16x8_t;
--void test_vbicQs16 (void)
--{
--
-- out_int16x8_t = vbicq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int32x4_t out_int32x4_t;
--int32x4_t arg0_int32x4_t;
--int32x4_t arg1_int32x4_t;
--void test_vbicQs32 (void)
--{
--
-- out_int32x4_t = vbicq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int64x2_t out_int64x2_t;
--int64x2_t arg0_int64x2_t;
--int64x2_t arg1_int64x2_t;
--void test_vbicQs64 (void)
--{
--
-- out_int64x2_t = vbicq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int8x16_t out_int8x16_t;
--int8x16_t arg0_int8x16_t;
--int8x16_t arg1_int8x16_t;
--void test_vbicQs8 (void)
--{
--
-- out_int8x16_t = vbicq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint16x8_t out_uint16x8_t;
--uint16x8_t arg0_uint16x8_t;
--uint16x8_t arg1_uint16x8_t;
--void test_vbicQu16 (void)
--{
--
-- out_uint16x8_t = vbicq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint32x4_t out_uint32x4_t;
--uint32x4_t arg0_uint32x4_t;
--uint32x4_t arg1_uint32x4_t;
--void test_vbicQu32 (void)
--{
--
-- out_uint32x4_t = vbicq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint64x2_t out_uint64x2_t;
--uint64x2_t arg0_uint64x2_t;
--uint64x2_t arg1_uint64x2_t;
--void test_vbicQu64 (void)
--{
--
-- out_uint64x2_t = vbicq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint8x16_t out_uint8x16_t;
--uint8x16_t arg0_uint8x16_t;
--uint8x16_t arg1_uint8x16_t;
--void test_vbicQu8 (void)
--{
--
-- out_uint8x16_t = vbicq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbics16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbics16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int16x4_t out_int16x4_t;
--int16x4_t arg0_int16x4_t;
--int16x4_t arg1_int16x4_t;
--void test_vbics16 (void)
--{
--
-- out_int16x4_t = vbic_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbics32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbics32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int32x2_t out_int32x2_t;
--int32x2_t arg0_int32x2_t;
--int32x2_t arg1_int32x2_t;
--void test_vbics32 (void)
--{
--
-- out_int32x2_t = vbic_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbics64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vbics64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int64x1_t out_int64x1_t;
--int64x1_t arg0_int64x1_t;
--int64x1_t arg1_int64x1_t;
--void test_vbics64 (void)
--{
--
-- out_int64x1_t = vbic_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbics8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbics8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int8x8_t out_int8x8_t;
--int8x8_t arg0_int8x8_t;
--int8x8_t arg1_int8x8_t;
--void test_vbics8 (void)
--{
--
-- out_int8x8_t = vbic_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint16x4_t out_uint16x4_t;
--uint16x4_t arg0_uint16x4_t;
--uint16x4_t arg1_uint16x4_t;
--void test_vbicu16 (void)
--{
--
-- out_uint16x4_t = vbic_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint32x2_t out_uint32x2_t;
--uint32x2_t arg0_uint32x2_t;
--uint32x2_t arg1_uint32x2_t;
--void test_vbicu32 (void)
--{
--
-- out_uint32x2_t = vbic_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vbicu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint64x1_t out_uint64x1_t;
--uint64x1_t arg0_uint64x1_t;
--uint64x1_t arg1_uint64x1_t;
--void test_vbicu64 (void)
--{
--
-- out_uint64x1_t = vbic_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbicu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vbicu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint8x8_t out_uint8x8_t;
--uint8x8_t arg0_uint8x8_t;
--uint8x8_t arg1_uint8x8_t;
--void test_vbicu8 (void)
--{
--
-- out_uint8x8_t = vbic_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vbic\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x4_t arg2_float32x4_t;
--
-- out_float32x4_t = vbslq_f32 (arg0_uint32x4_t, arg1_float32x4_t, arg2_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQp16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQp16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- poly16x8_t arg1_poly16x8_t;
-- poly16x8_t arg2_poly16x8_t;
--
-- out_poly16x8_t = vbslq_p16 (arg0_uint16x8_t, arg1_poly16x8_t, arg2_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQp64.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vbslQp64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- poly64x2_t arg1_poly64x2_t;
-- poly64x2_t arg2_poly64x2_t;
--
-- out_poly64x2_t = vbslq_p64 (arg0_uint64x2_t, arg1_poly64x2_t, arg2_poly64x2_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQp8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- poly8x16_t arg1_poly8x16_t;
-- poly8x16_t arg2_poly8x16_t;
--
-- out_poly8x16_t = vbslq_p8 (arg0_uint8x16_t, arg1_poly8x16_t, arg2_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16x8_t arg2_int16x8_t;
--
-- out_int16x8_t = vbslq_s16 (arg0_uint16x8_t, arg1_int16x8_t, arg2_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32x4_t arg2_int32x4_t;
--
-- out_int32x4_t = vbslq_s32 (arg0_uint32x4_t, arg1_int32x4_t, arg2_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs64.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- int64x2_t arg1_int64x2_t;
-- int64x2_t arg2_int64x2_t;
--
-- out_int64x2_t = vbslq_s64 (arg0_uint64x2_t, arg1_int64x2_t, arg2_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQs8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- int8x16_t arg1_int8x16_t;
-- int8x16_t arg2_int8x16_t;
--
-- out_int8x16_t = vbslq_s8 (arg0_uint8x16_t, arg1_int8x16_t, arg2_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16x8_t arg2_uint16x8_t;
--
-- out_uint16x8_t = vbslq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32x4_t arg2_uint32x4_t;
--
-- out_uint32x4_t = vbslq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu64.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
-- uint64x2_t arg2_uint64x2_t;
--
-- out_uint64x2_t = vbslq_u64 (arg0_uint64x2_t, arg1_uint64x2_t, arg2_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslQu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
-- uint8x16_t arg2_uint8x16_t;
--
-- out_uint8x16_t = vbslq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vbsl_f32 (arg0_uint32x2_t, arg1_float32x2_t, arg2_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslp16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslp16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- poly16x4_t arg1_poly16x4_t;
-- poly16x4_t arg2_poly16x4_t;
--
-- out_poly16x4_t = vbsl_p16 (arg0_uint16x4_t, arg1_poly16x4_t, arg2_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslp64.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vbslp64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- poly64x1_t arg1_poly64x1_t;
-- poly64x1_t arg2_poly64x1_t;
--
-- out_poly64x1_t = vbsl_p64 (arg0_uint64x1_t, arg1_poly64x1_t, arg2_poly64x1_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslp8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- poly8x8_t arg1_poly8x8_t;
-- poly8x8_t arg2_poly8x8_t;
--
-- out_poly8x8_t = vbsl_p8 (arg0_uint8x8_t, arg1_poly8x8_t, arg2_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbsls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbsls16 (void)
--{
-- int16x4_t out_int16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x4_t = vbsl_s16 (arg0_uint16x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbsls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbsls32 (void)
--{
-- int32x2_t out_int32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x2_t = vbsl_s32 (arg0_uint32x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls64.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbsls64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbsls64 (void)
--{
-- int64x1_t out_int64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- int64x1_t arg1_int64x1_t;
-- int64x1_t arg2_int64x1_t;
--
-- out_int64x1_t = vbsl_s64 (arg0_uint64x1_t, arg1_int64x1_t, arg2_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbsls8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbsls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbsls8 (void)
--{
-- int8x8_t out_int8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vbsl_s8 (arg0_uint8x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x4_t = vbsl_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x2_t = vbsl_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu64.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
-- uint64x1_t arg2_uint64x1_t;
--
-- out_uint64x1_t = vbsl_u64 (arg0_uint64x1_t, arg1_uint64x1_t, arg2_uint64x1_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vbslu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vbslu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vbslu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vbsl_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "((vbsl)|(vbit)|(vbif))\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcageQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcageQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcageQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcageq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcagef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcagef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcagef32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcage_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcagtQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcagtQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcagtQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcagtq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcagtf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcagtf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcagtf32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcagt_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcaleQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcaleQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcaleQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcaleq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcalef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcalef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcalef32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcale_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vacge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcaltQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcaltQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcaltQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcaltq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcaltf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcaltf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcaltf32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcalt_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vacgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vceqq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQp8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_uint8x16_t = vceqq_p8 (arg0_poly8x16_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQs16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vceqq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQs32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vceqq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQs8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vceqq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vceqq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vceqq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vceqq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqf32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vceq_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqp8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_uint8x8_t = vceq_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqs16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vceq_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqs32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vceq_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vceqs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vceqs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vceqs8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vceq_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcequ16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcequ16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcequ16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vceq_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcequ32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcequ32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcequ32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vceq_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcequ8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcequ8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcequ8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vceq_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcgeq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQs16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vcgeq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQs32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vcgeq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQs8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vcgeq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vcgeq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vcgeq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vcgeq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgef32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcge_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcges16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcges16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcges16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vcge_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcges32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcges32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcges32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vcge_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcges8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcges8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcges8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vcge_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vcge_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vcge_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgeu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgeu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgeu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vcge_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcgtq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQs16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vcgtq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQs32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vcgtq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQs8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vcgtq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vcgtq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vcgtq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vcgtq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtf32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcgt_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgts16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgts16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgts16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vcgt_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgts32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgts32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgts32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vcgt_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgts8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgts8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgts8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vcgt_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vcgt_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vcgt_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcgtu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcgtu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcgtu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vcgt_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcleq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQs16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vcleq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQs32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vcleq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQs8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vcleq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vcleq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vcleq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vcleq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vclef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclef32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vcle_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcles16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcles16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcles16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vcle_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcles32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcles32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcles32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vcle_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcles8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcles8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcles8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vcle_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vcle_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vcle_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcleu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcleu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcleu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vcle_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclsQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclsQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclsQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vclsq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcls\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclsQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclsQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclsQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vclsq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcls\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclsQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclsQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclsQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vclsq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcls\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclss16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclss16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclss16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vcls_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcls\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclss32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclss32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclss32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vcls_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcls\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclss8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclss8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclss8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vcls_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcls\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQf32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_uint32x4_t = vcltq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQs16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vcltq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQs32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vcltq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQs8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vcltq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vcltq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vcltq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vcltq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltf32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_uint32x2_t = vclt_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclts16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vclts16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclts16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vclt_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclts32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vclts32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclts32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vclt_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclts8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vclts8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclts8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vclt_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vclt_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vclt_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcltu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vcltu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcltu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vclt_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vclzq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vclzq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vclzq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vclzq_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vclzq_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzQu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vclzq_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vclz_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vclz_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vclz_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vclz_u16 (arg0_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vclz_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vclzu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vclzu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vclzu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vclz_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vclz\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcntQp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcntQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcntQp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x16_t = vcntq_p8 (arg0_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcntQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcntQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcntQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vcntq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcntQu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcntQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcntQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vcntq_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcntp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcntp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcntp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x8_t = vcnt_p8 (arg0_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcnts8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcnts8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcnts8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vcnt_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcntu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcntu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcntu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vcnt_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vcnt\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombinef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombinef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x4_t = vcombine_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombinep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombinep16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x8_t = vcombine_p16 (arg0_poly16x4_t, arg1_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinep64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombinep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vcombinep64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64x1_t arg0_poly64x1_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- out_poly64x2_t = vcombine_p64 (arg0_poly64x1_t, arg1_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombinep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombinep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombinep8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x16_t = vcombine_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombines16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombines16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x8_t = vcombine_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombines32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombines32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x4_t = vcombine_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombines64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombines64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x2_t = vcombine_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombines8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombines8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombines8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x16_t = vcombine_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombineu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombineu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x8_t = vcombine_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombineu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombineu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x4_t = vcombine_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombineu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombineu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x2_t = vcombine_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcombineu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcombineu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcombineu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x16_t = vcombine_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatef32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreatef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreatef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint64_t arg0_uint64_t;
--
-- out_float32x2_t = vcreate_f32 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatep16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreatep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreatep16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- uint64_t arg0_uint64_t;
--
-- out_poly16x4_t = vcreate_p16 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatep64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreatep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vcreatep64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- uint64_t arg0_uint64_t;
--
-- out_poly64x1_t = vcreate_p64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreatep8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreatep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreatep8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- uint64_t arg0_uint64_t;
--
-- out_poly8x8_t = vcreate_p8 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreates16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreates16 (void)
--{
-- int16x4_t out_int16x4_t;
-- uint64_t arg0_uint64_t;
--
-- out_int16x4_t = vcreate_s16 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreates32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreates32 (void)
--{
-- int32x2_t out_int32x2_t;
-- uint64_t arg0_uint64_t;
--
-- out_int32x2_t = vcreate_s32 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreates64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreates64 (void)
--{
-- int64x1_t out_int64x1_t;
-- uint64_t arg0_uint64_t;
--
-- out_int64x1_t = vcreate_s64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreates8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreates8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreates8 (void)
--{
-- int8x8_t out_int8x8_t;
-- uint64_t arg0_uint64_t;
--
-- out_int8x8_t = vcreate_s8 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreateu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreateu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint16x4_t = vcreate_u16 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreateu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreateu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint32x2_t = vcreate_u32 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreateu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreateu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint64x1_t = vcreate_u64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcreateu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vcreateu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcreateu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint8x8_t = vcreate_u8 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_nf32_s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQ_nf32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQ_nf32_s32 (void)
--{
-- float32x4_t out_float32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_float32x4_t = vcvtq_n_f32_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_nf32_u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQ_nf32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQ_nf32_u32 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_float32x4_t = vcvtq_n_f32_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_ns32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQ_ns32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQ_ns32_f32 (void)
--{
-- int32x4_t out_int32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_int32x4_t = vcvtq_n_s32_f32 (arg0_float32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQ_nu32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQ_nu32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQ_nu32_f32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_uint32x4_t = vcvtq_n_u32_f32 (arg0_float32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQf32_s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQf32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQf32_s32 (void)
--{
-- float32x4_t out_float32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_float32x4_t = vcvtq_f32_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQf32_u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQf32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQf32_u32 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_float32x4_t = vcvtq_f32_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQs32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQs32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQs32_f32 (void)
--{
-- int32x4_t out_int32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_int32x4_t = vcvtq_s32_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtQu32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtQu32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtQu32_f32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_uint32x4_t = vcvtq_u32_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_nf32_s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvt_nf32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvt_nf32_s32 (void)
--{
-- float32x2_t out_float32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_float32x2_t = vcvt_n_f32_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_nf32_u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvt_nf32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvt_nf32_u32 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_float32x2_t = vcvt_n_f32_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_ns32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvt_ns32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvt_ns32_f32 (void)
--{
-- int32x2_t out_int32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_int32x2_t = vcvt_n_s32_f32 (arg0_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvt_nu32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvt_nu32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvt_nu32_f32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_uint32x2_t = vcvt_n_u32_f32 (arg0_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf16_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtf16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_fp16_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon_fp16 } */
--
--#include "arm_neon.h"
--
--void test_vcvtf16_f32 (void)
--{
-- float16x4_t out_float16x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float16x4_t = vcvt_f16_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f16.f32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf32_f16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtf32_f16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_fp16_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon_fp16 } */
--
--#include "arm_neon.h"
--
--void test_vcvtf32_f16 (void)
--{
-- float32x4_t out_float32x4_t;
-- float16x4_t arg0_float16x4_t;
--
-- out_float32x4_t = vcvt_f32_f16 (arg0_float16x4_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.f16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf32_s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtf32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtf32_s32 (void)
--{
-- float32x2_t out_float32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_float32x2_t = vcvt_f32_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtf32_u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtf32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtf32_u32 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_float32x2_t = vcvt_f32_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.f32.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvts32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvts32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvts32_f32 (void)
--{
-- int32x2_t out_int32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_int32x2_t = vcvt_s32_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.s32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vcvtu32_f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vcvtu32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vcvtu32_f32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_uint32x2_t = vcvt_u32_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vcvt\.u32.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x4_t = vdupq_lane_f32 (arg0_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanep16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly16x8_t = vdupq_lane_p16 (arg0_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanep64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdupQ_lanep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanep64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_poly64x2_t = vdupq_lane_p64 (arg0_poly64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanep8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x16_t = vdupq_lane_p8 (arg0_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x8_t = vdupq_lane_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x4_t = vdupq_lane_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdupQ_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanes64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64x2_t = vdupq_lane_s64 (arg0_int64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_lanes8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x16_t = vdupq_lane_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_laneu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x8_t = vdupq_lane_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_laneu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x4_t = vdupq_lane_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdupQ_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_laneu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64x2_t = vdupq_lane_u64 (arg0_uint64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_laneu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x16_t = vdupq_lane_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_nf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32_t arg0_float32_t;
--
-- out_float32x4_t = vdupq_n_f32 (arg0_float32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_np16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_np16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16_t arg0_poly16_t;
--
-- out_poly16x8_t = vdupq_n_p16 (arg0_poly16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_np64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdupQ_np64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_np64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64_t arg0_poly64_t;
--
-- out_poly64x2_t = vdupq_n_p64 (arg0_poly64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_np8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_np8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8_t arg0_poly8_t;
--
-- out_poly8x16_t = vdupq_n_p8 (arg0_poly8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16_t arg0_int16_t;
--
-- out_int16x8_t = vdupq_n_s16 (arg0_int16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32_t arg0_int32_t;
--
-- out_int32x4_t = vdupq_n_s32 (arg0_int32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdupQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64_t arg0_int64_t;
--
-- out_int64x2_t = vdupq_n_s64 (arg0_int64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8_t arg0_int8_t;
--
-- out_int8x16_t = vdupq_n_s8 (arg0_int8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16_t arg0_uint16_t;
--
-- out_uint16x8_t = vdupq_n_u16 (arg0_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32_t arg0_uint32_t;
--
-- out_uint32x4_t = vdupq_n_u32 (arg0_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdupQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint64x2_t = vdupq_n_u64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdupQ_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdupQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdupQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8_t arg0_uint8_t;
--
-- out_uint8x16_t = vdupq_n_u8 (arg0_uint8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vdup_lane_f32 (arg0_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanep16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly16x4_t = vdup_lane_p16 (arg0_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanep64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdup_lanep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanep64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_poly64x1_t = vdup_lane_p64 (arg0_poly64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanep8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x8_t = vdup_lane_p8 (arg0_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vdup_lane_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vdup_lane_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdup_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanes64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64x1_t = vdup_lane_s64 (arg0_int64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_lanes8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vdup_lane_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_laneu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vdup_lane_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_laneu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vdup_lane_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdup_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_laneu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64x1_t = vdup_lane_u64 (arg0_uint64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_laneu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vdup_lane_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_nf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32_t arg0_float32_t;
--
-- out_float32x2_t = vdup_n_f32 (arg0_float32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_np16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_np16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16_t arg0_poly16_t;
--
-- out_poly16x4_t = vdup_n_p16 (arg0_poly16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_np64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdup_np64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vdup_np64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64_t arg0_poly64_t;
--
-- out_poly64x1_t = vdup_n_p64 (arg0_poly64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_np8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_np8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8_t arg0_poly8_t;
--
-- out_poly8x8_t = vdup_n_p8 (arg0_poly8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16_t arg0_int16_t;
--
-- out_int16x4_t = vdup_n_s16 (arg0_int16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32_t arg0_int32_t;
--
-- out_int32x2_t = vdup_n_s32 (arg0_int32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdup_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64_t arg0_int64_t;
--
-- out_int64x1_t = vdup_n_s64 (arg0_int64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8_t arg0_int8_t;
--
-- out_int8x8_t = vdup_n_s8 (arg0_int8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16_t arg0_uint16_t;
--
-- out_uint16x4_t = vdup_n_u16 (arg0_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32_t arg0_uint32_t;
--
-- out_uint32x2_t = vdup_n_u32 (arg0_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vdup_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint64x1_t = vdup_n_u64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vdup_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vdup_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vdup_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8_t arg0_uint8_t;
--
-- out_uint8x8_t = vdup_n_u8 (arg0_uint8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c
-+++ b/src//dev/null
-@@ -1,27 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -mvectorize-with-neon-double" } */
--/* { dg-add-options arm_neon } */
--
--#define N 32
--
--int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
--float fa[N];
--int ia[N];
--
--int convert()
--{
-- int i;
--
-- /* int -> float */
-- for (i = 0; i < N; i++)
-- fa[i] = (float) ib[i];
--
-- /* float -> int */
-- for (i = 0; i < N; i++)
-- ia[i] = (int) fa[i];
--
-- return 0;
--}
--
--/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c
-+++ b/src//dev/null
-@@ -1,27 +0,0 @@
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
--/* { dg-add-options arm_neon } */
--
--#define N 32
--
--int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
--float fa[N];
--int ia[N];
--
--int convert()
--{
-- int i;
--
-- /* int -> float */
-- for (i = 0; i < N; i++)
-- fa[i] = (float) ib[i];
--
-- /* float -> int */
-- for (i = 0; i < N; i++)
-- ia[i] = (int) fa[i];
--
-- return 0;
--}
--
--/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = veorq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = veorq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = veorq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = veorq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = veorq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = veorq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = veorq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veorQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veorQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veorQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = veorq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veors16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veors16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veors16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = veor_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veors32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veors32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veors32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = veor_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veors64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `veors64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veors64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = veor_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/veors8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veors8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veors8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = veor_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veoru16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veoru16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veoru16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = veor_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veoru32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veoru32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veoru32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = veor_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/veoru64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `veoru64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veoru64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = veor_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/veoru8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `veoru8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_veoru8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = veor_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "veor\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vextq_f32 (arg0_float32x4_t, arg1_float32x4_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQp16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x8_t arg0_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8_t = vextq_p16 (arg0_poly16x8_t, arg1_poly16x8_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQp64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vextQp64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64x2_t arg0_poly64x2_t;
-- poly64x2_t arg1_poly64x2_t;
--
-- out_poly64x2_t = vextq_p64 (arg0_poly64x2_t, arg1_poly64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16_t = vextq_p8 (arg0_poly8x16_t, arg1_poly8x16_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vextq_s16 (arg0_int16x8_t, arg1_int16x8_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vextq_s32 (arg0_int32x4_t, arg1_int32x4_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vextq_s64 (arg0_int64x2_t, arg1_int64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vextq_s8 (arg0_int8x16_t, arg1_int8x16_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vextq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vextq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vextq_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vextq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vext_f32 (arg0_float32x2_t, arg1_float32x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextp16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4_t = vext_p16 (arg0_poly16x4_t, arg1_poly16x4_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextp64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vextp64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x1_t arg0_poly64x1_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- out_poly64x1_t = vext_p64 (arg0_poly64x1_t, arg1_poly64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8_t = vext_p8 (arg0_poly8x8_t, arg1_poly8x8_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vexts16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vexts16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vexts16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vext_s16 (arg0_int16x4_t, arg1_int16x4_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vexts32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vexts32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vexts32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vext_s32 (arg0_int32x2_t, arg1_int32x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vexts64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vexts64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vexts64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vext_s64 (arg0_int64x1_t, arg1_int64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vexts8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vexts8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vexts8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vext_s8 (arg0_int8x8_t, arg1_int8x8_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vext_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vext_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vext_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vextu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vextu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vextu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vext_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 0);
--}
--
--/* { dg-final { scan-assembler "vext\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vfmaQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neonv2_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neonv2 } */
--
--#include "arm_neon.h"
--
--void test_vfmaQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x4_t arg2_float32x4_t;
--
-- out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vfmaf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neonv2_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neonv2 } */
--
--#include "arm_neon.h"
--
--void test_vfmaf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vfmsQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neonv2_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neonv2 } */
--
--#include "arm_neon.h"
--
--void test_vfmsQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x4_t arg2_float32x4_t;
--
-- out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vfmsf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neonv2_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neonv2 } */
--
--#include "arm_neon.h"
--
--void test_vfmsf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vfp-shift-a2t2.c
-+++ b/src//dev/null
-@@ -1,27 +0,0 @@
--/* Check that NEON vector shifts support immediate values == size. */
--
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps" } */
--/* { dg-add-options arm_neon } */
--
--#include <arm_neon.h>
--
--uint16x8_t test_vshll_n_u8 (uint8x8_t a)
--{
-- return vshll_n_u8(a, 8);
--}
--
--uint32x4_t test_vshll_n_u16 (uint16x4_t a)
--{
-- return vshll_n_u16(a, 16);
--}
--
--uint64x2_t test_vshll_n_u32 (uint32x2_t a)
--{
-- return vshll_n_u32(a, 32);
--}
--
--/* { dg-final { scan-assembler "vshll\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vshll\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vshll\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanef32 (void)
--{
-- float32_t out_float32_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32_t = vgetq_lane_f32 (arg0_float32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanep16 (void)
--{
-- poly16_t out_poly16_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly16_t = vgetq_lane_p16 (arg0_poly16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanep8 (void)
--{
-- poly8_t out_poly8_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8_t = vgetq_lane_p8 (arg0_poly8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanes16 (void)
--{
-- int16_t out_int16_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16_t = vgetq_lane_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.s16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanes32 (void)
--{
-- int32_t out_int32_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32_t = vgetq_lane_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanes64 (void)
--{
-- register int64_t out_int64_t asm ("r0");
-- int64x2_t arg0_int64x2_t;
--
-- out_int64_t = vgetq_lane_s64 (arg0_int64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "((vmov)|(fmrrd))\[ \]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_lanes8 (void)
--{
-- int8_t out_int8_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8_t = vgetq_lane_s8 (arg0_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.s8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_laneu16 (void)
--{
-- uint16_t out_uint16_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16_t = vgetq_lane_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_laneu32 (void)
--{
-- uint32_t out_uint32_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32_t = vgetq_lane_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_laneu64 (void)
--{
-- register uint64_t out_uint64_t asm ("r0");
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64_t = vgetq_lane_u64 (arg0_uint64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "((vmov)|(fmrrd))\[ \]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vgetQ_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vgetQ_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vgetQ_laneu8 (void)
--{
-- uint8_t out_uint8_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8_t = vgetq_lane_u8 (arg0_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x2_t = vget_high_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highp16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly16x4_t = vget_high_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vget_highp64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_poly64x1_t = vget_high_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x8_t = vget_high_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x4_t = vget_high_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x2_t = vget_high_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highs64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int64x1_t = vget_high_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highs8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x8_t = vget_high_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x4_t = vget_high_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x2_t = vget_high_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64x1_t = vget_high_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_highu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_highu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_highu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x8_t = vget_high_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanef32 (void)
--{
-- float32_t out_float32_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32_t = vget_lane_f32 (arg0_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanep16 (void)
--{
-- poly16_t out_poly16_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly16_t = vget_lane_p16 (arg0_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanep8 (void)
--{
-- poly8_t out_poly8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8_t = vget_lane_p8 (arg0_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanes16 (void)
--{
-- int16_t out_int16_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16_t = vget_lane_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.s16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanes32 (void)
--{
-- int32_t out_int32_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32_t = vget_lane_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanes64 (void)
--{
-- int64_t out_int64_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64_t = vget_lane_s64 (arg0_int64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lanes8 (void)
--{
-- int8_t out_int8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8_t = vget_lane_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.s8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_laneu16 (void)
--{
-- uint16_t out_uint16_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16_t = vget_lane_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u16\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_laneu32 (void)
--{
-- uint32_t out_uint32_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32_t = vget_lane_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_laneu64 (void)
--{
-- uint64_t out_uint64_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64_t = vget_lane_u64 (arg0_uint64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_laneu8 (void)
--{
-- uint8_t out_uint8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8_t = vget_lane_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.u8\[ \]+\[rR\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lowf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowf32 (void)
--{
-- register float32x2_t out_float32x2_t asm ("d18");
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x2_t = vget_low_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lowp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowp16 (void)
--{
-- register poly16x4_t out_poly16x4_t asm ("d18");
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly16x4_t = vget_low_p16 (arg0_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_lowp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vget_lowp64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_poly64x1_t = vget_low_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lowp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowp8 (void)
--{
-- register poly8x8_t out_poly8x8_t asm ("d18");
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x8_t = vget_low_p8 (arg0_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lows16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lows16 (void)
--{
-- register int16x4_t out_int16x4_t asm ("d18");
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x4_t = vget_low_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lows32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lows32 (void)
--{
-- register int32x2_t out_int32x2_t asm ("d18");
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x2_t = vget_low_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_lows64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lows64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int64x1_t = vget_low_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lows8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lows8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lows8 (void)
--{
-- register int8x8_t out_int8x8_t asm ("d18");
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x8_t = vget_low_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lowu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowu16 (void)
--{
-- register uint16x4_t out_uint16x4_t asm ("d18");
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x4_t = vget_low_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lowu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowu32 (void)
--{
-- register uint32x2_t out_uint32x2_t asm ("d18");
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x2_t = vget_low_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vget_lowu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64x1_t = vget_low_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vget_lowu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vget_lowu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vget_lowu8 (void)
--{
-- register uint8x8_t out_uint8x8_t asm ("d18");
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x8_t = vget_low_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vhaddq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vhaddq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vhaddq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vhaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vhaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vhaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhadds16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhadds16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhadds16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vhadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhadds32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhadds32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhadds32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vhadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhadds8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhadds8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhadds8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vhadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vhadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vhadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhaddu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhaddu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhaddu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vhadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vhadd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vhsubq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vhsubq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vhsubq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vhsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vhsubq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vhsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vhsub_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vhsub_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vhsub_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vhsub_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vhsub_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vhsubu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vhsubu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vhsubu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vhsub_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vhsub\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupf32 (void)
--{
-- float32x4_t out_float32x4_t;
--
-- out_float32x4_t = vld1q_dup_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupp16 (void)
--{
-- poly16x8_t out_poly16x8_t;
--
-- out_poly16x8_t = vld1q_dup_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupp64 (void)
--{
-- poly64x2_t out_poly64x2_t;
--
-- out_poly64x2_t = vld1q_dup_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
--
-- out_poly8x16_t = vld1q_dup_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dups16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dups16 (void)
--{
-- int16x8_t out_int16x8_t;
--
-- out_int16x8_t = vld1q_dup_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dups32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dups32 (void)
--{
-- int32x4_t out_int32x4_t;
--
-- out_int32x4_t = vld1q_dup_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dups64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dups64 (void)
--{
-- int64x2_t out_int64x2_t;
--
-- out_int64x2_t = vld1q_dup_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dups8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dups8 (void)
--{
-- int8x16_t out_int8x16_t;
--
-- out_int8x16_t = vld1q_dup_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
--
-- out_uint16x8_t = vld1q_dup_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
--
-- out_uint32x4_t = vld1q_dup_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
--
-- out_uint64x2_t = vld1q_dup_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Q_dupu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_dupu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
--
-- out_uint8x16_t = vld1q_dup_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vld1q_lane_f32 (0, arg1_float32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanep16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8_t = vld1q_lane_p16 (0, arg1_poly16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanep64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanep64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64x2_t arg1_poly64x2_t;
--
-- out_poly64x2_t = vld1q_lane_p64 (0, arg1_poly64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanep8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16_t = vld1q_lane_p8 (0, arg1_poly8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vld1q_lane_s16 (0, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vld1q_lane_s32 (0, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanes64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vld1q_lane_s64 (0, arg1_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_lanes8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vld1q_lane_s8 (0, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_laneu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vld1q_lane_u16 (0, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_laneu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vld1q_lane_u32 (0, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_laneu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vld1q_lane_u64 (0, arg1_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Q_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1Q_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Q_laneu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vld1q_lane_u8 (0, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qf32 (void)
--{
-- float32x4_t out_float32x4_t;
--
-- out_float32x4_t = vld1q_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qp16 (void)
--{
-- poly16x8_t out_poly16x8_t;
--
-- out_poly16x8_t = vld1q_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld1Qp64 (void)
--{
-- poly64x2_t out_poly64x2_t;
--
-- out_poly64x2_t = vld1q_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
--
-- out_poly8x16_t = vld1q_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qs16 (void)
--{
-- int16x8_t out_int16x8_t;
--
-- out_int16x8_t = vld1q_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qs32 (void)
--{
-- int32x4_t out_int32x4_t;
--
-- out_int32x4_t = vld1q_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qs64 (void)
--{
-- int64x2_t out_int64x2_t;
--
-- out_int64x2_t = vld1q_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qs8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qs8 (void)
--{
-- int8x16_t out_int8x16_t;
--
-- out_int8x16_t = vld1q_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
--
-- out_uint16x8_t = vld1q_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
--
-- out_uint32x4_t = vld1q_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
--
-- out_uint64x2_t = vld1q_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1Qu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1Qu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
--
-- out_uint8x16_t = vld1q_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupf32 (void)
--{
-- float32x2_t out_float32x2_t;
--
-- out_float32x2_t = vld1_dup_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupp16 (void)
--{
-- poly16x4_t out_poly16x4_t;
--
-- out_poly16x4_t = vld1_dup_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupp64 (void)
--{
-- poly64x1_t out_poly64x1_t;
--
-- out_poly64x1_t = vld1_dup_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
--
-- out_poly8x8_t = vld1_dup_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dups16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dups16 (void)
--{
-- int16x4_t out_int16x4_t;
--
-- out_int16x4_t = vld1_dup_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dups32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dups32 (void)
--{
-- int32x2_t out_int32x2_t;
--
-- out_int32x2_t = vld1_dup_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dups64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dups64 (void)
--{
-- int64x1_t out_int64x1_t;
--
-- out_int64x1_t = vld1_dup_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dups8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dups8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dups8 (void)
--{
-- int8x8_t out_int8x8_t;
--
-- out_int8x8_t = vld1_dup_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
--
-- out_uint16x4_t = vld1_dup_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
--
-- out_uint32x2_t = vld1_dup_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
--
-- out_uint64x1_t = vld1_dup_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_dupu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1_dupu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_dupu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
--
-- out_uint8x8_t = vld1_dup_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\\\]\\\})|(\[dD\]\[0-9\]+\\\[\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vld1_lane_f32 (0, arg1_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanep16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4_t = vld1_lane_p16 (0, arg1_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanep64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanep64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- out_poly64x1_t = vld1_lane_p64 (0, arg1_poly64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanep8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8_t = vld1_lane_p8 (0, arg1_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vld1_lane_s16 (0, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vld1_lane_s32 (0, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanes64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vld1_lane_s64 (0, arg1_int64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_lanes8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vld1_lane_s8 (0, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_laneu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vld1_lane_u16 (0, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_laneu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vld1_lane_u32 (0, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_laneu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vld1_lane_u64 (0, arg1_uint64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld1_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1_laneu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vld1_lane_u8 (0, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1f32 (void)
--{
-- float32x2_t out_float32x2_t;
--
-- out_float32x2_t = vld1_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1p16 (void)
--{
-- poly16x4_t out_poly16x4_t;
--
-- out_poly16x4_t = vld1_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld1p64 (void)
--{
-- poly64x1_t out_poly64x1_t;
--
-- out_poly64x1_t = vld1_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
--
-- out_poly8x8_t = vld1_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1s16 (void)
--{
-- int16x4_t out_int16x4_t;
--
-- out_int16x4_t = vld1_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1s32 (void)
--{
-- int32x2_t out_int32x2_t;
--
-- out_int32x2_t = vld1_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1s64 (void)
--{
-- int64x1_t out_int64x1_t;
--
-- out_int64x1_t = vld1_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1s8 (void)
--{
-- int8x8_t out_int8x8_t;
--
-- out_int8x8_t = vld1_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1u16 (void)
--{
-- uint16x4_t out_uint16x4_t;
--
-- out_uint16x4_t = vld1_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1u32 (void)
--{
-- uint32x2_t out_uint32x2_t;
--
-- out_uint32x2_t = vld1_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1u64 (void)
--{
-- uint64x1_t out_uint64x1_t;
--
-- out_uint64x1_t = vld1_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld1u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld1u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld1u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
--
-- out_uint8x8_t = vld1_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Q_lanef32 (void)
--{
-- float32x4x2_t out_float32x4x2_t;
-- float32x4x2_t arg1_float32x4x2_t;
--
-- out_float32x4x2_t = vld2q_lane_f32 (0, arg1_float32x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Q_lanep16 (void)
--{
-- poly16x8x2_t out_poly16x8x2_t;
-- poly16x8x2_t arg1_poly16x8x2_t;
--
-- out_poly16x8x2_t = vld2q_lane_p16 (0, arg1_poly16x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Q_lanes16 (void)
--{
-- int16x8x2_t out_int16x8x2_t;
-- int16x8x2_t arg1_int16x8x2_t;
--
-- out_int16x8x2_t = vld2q_lane_s16 (0, arg1_int16x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Q_lanes32 (void)
--{
-- int32x4x2_t out_int32x4x2_t;
-- int32x4x2_t arg1_int32x4x2_t;
--
-- out_int32x4x2_t = vld2q_lane_s32 (0, arg1_int32x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Q_laneu16 (void)
--{
-- uint16x8x2_t out_uint16x8x2_t;
-- uint16x8x2_t arg1_uint16x8x2_t;
--
-- out_uint16x8x2_t = vld2q_lane_u16 (0, arg1_uint16x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Q_laneu32 (void)
--{
-- uint32x4x2_t out_uint32x4x2_t;
-- uint32x4x2_t arg1_uint32x4x2_t;
--
-- out_uint32x4x2_t = vld2q_lane_u32 (0, arg1_uint32x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qf32 (void)
--{
-- float32x4x2_t out_float32x4x2_t;
--
-- out_float32x4x2_t = vld2q_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qp16 (void)
--{
-- poly16x8x2_t out_poly16x8x2_t;
--
-- out_poly16x8x2_t = vld2q_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qp8 (void)
--{
-- poly8x16x2_t out_poly8x16x2_t;
--
-- out_poly8x16x2_t = vld2q_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qs16 (void)
--{
-- int16x8x2_t out_int16x8x2_t;
--
-- out_int16x8x2_t = vld2q_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qs32 (void)
--{
-- int32x4x2_t out_int32x4x2_t;
--
-- out_int32x4x2_t = vld2q_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qs8 (void)
--{
-- int8x16x2_t out_int8x16x2_t;
--
-- out_int8x16x2_t = vld2q_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qu16 (void)
--{
-- uint16x8x2_t out_uint16x8x2_t;
--
-- out_uint16x8x2_t = vld2q_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qu32 (void)
--{
-- uint32x4x2_t out_uint32x4x2_t;
--
-- out_uint32x4x2_t = vld2q_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2Qu8 (void)
--{
-- uint8x16x2_t out_uint8x16x2_t;
--
-- out_uint8x16x2_t = vld2q_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupf32 (void)
--{
-- float32x2x2_t out_float32x2x2_t;
--
-- out_float32x2x2_t = vld2_dup_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupp16 (void)
--{
-- poly16x4x2_t out_poly16x4x2_t;
--
-- out_poly16x4x2_t = vld2_dup_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupp64 (void)
--{
-- poly64x1x2_t out_poly64x1x2_t;
--
-- out_poly64x1x2_t = vld2_dup_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupp8 (void)
--{
-- poly8x8x2_t out_poly8x8x2_t;
--
-- out_poly8x8x2_t = vld2_dup_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dups16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dups16 (void)
--{
-- int16x4x2_t out_int16x4x2_t;
--
-- out_int16x4x2_t = vld2_dup_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dups32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dups32 (void)
--{
-- int32x2x2_t out_int32x2x2_t;
--
-- out_int32x2x2_t = vld2_dup_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dups64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dups64 (void)
--{
-- int64x1x2_t out_int64x1x2_t;
--
-- out_int64x1x2_t = vld2_dup_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dups8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dups8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dups8 (void)
--{
-- int8x8x2_t out_int8x8x2_t;
--
-- out_int8x8x2_t = vld2_dup_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupu16 (void)
--{
-- uint16x4x2_t out_uint16x4x2_t;
--
-- out_uint16x4x2_t = vld2_dup_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupu32 (void)
--{
-- uint32x2x2_t out_uint32x2x2_t;
--
-- out_uint32x2x2_t = vld2_dup_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupu64 (void)
--{
-- uint64x1x2_t out_uint64x1x2_t;
--
-- out_uint64x1x2_t = vld2_dup_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_dupu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2_dupu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_dupu8 (void)
--{
-- uint8x8x2_t out_uint8x8x2_t;
--
-- out_uint8x8x2_t = vld2_dup_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_lanef32 (void)
--{
-- float32x2x2_t out_float32x2x2_t;
-- float32x2x2_t arg1_float32x2x2_t;
--
-- out_float32x2x2_t = vld2_lane_f32 (0, arg1_float32x2x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_lanep16 (void)
--{
-- poly16x4x2_t out_poly16x4x2_t;
-- poly16x4x2_t arg1_poly16x4x2_t;
--
-- out_poly16x4x2_t = vld2_lane_p16 (0, arg1_poly16x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_lanep8 (void)
--{
-- poly8x8x2_t out_poly8x8x2_t;
-- poly8x8x2_t arg1_poly8x8x2_t;
--
-- out_poly8x8x2_t = vld2_lane_p8 (0, arg1_poly8x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_lanes16 (void)
--{
-- int16x4x2_t out_int16x4x2_t;
-- int16x4x2_t arg1_int16x4x2_t;
--
-- out_int16x4x2_t = vld2_lane_s16 (0, arg1_int16x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_lanes32 (void)
--{
-- int32x2x2_t out_int32x2x2_t;
-- int32x2x2_t arg1_int32x2x2_t;
--
-- out_int32x2x2_t = vld2_lane_s32 (0, arg1_int32x2x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_lanes8 (void)
--{
-- int8x8x2_t out_int8x8x2_t;
-- int8x8x2_t arg1_int8x8x2_t;
--
-- out_int8x8x2_t = vld2_lane_s8 (0, arg1_int8x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_laneu16 (void)
--{
-- uint16x4x2_t out_uint16x4x2_t;
-- uint16x4x2_t arg1_uint16x4x2_t;
--
-- out_uint16x4x2_t = vld2_lane_u16 (0, arg1_uint16x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_laneu32 (void)
--{
-- uint32x2x2_t out_uint32x2x2_t;
-- uint32x2x2_t arg1_uint32x2x2_t;
--
-- out_uint32x2x2_t = vld2_lane_u32 (0, arg1_uint32x2x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld2_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2_laneu8 (void)
--{
-- uint8x8x2_t out_uint8x8x2_t;
-- uint8x8x2_t arg1_uint8x8x2_t;
--
-- out_uint8x8x2_t = vld2_lane_u8 (0, arg1_uint8x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2f32 (void)
--{
-- float32x2x2_t out_float32x2x2_t;
--
-- out_float32x2x2_t = vld2_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2p16 (void)
--{
-- poly16x4x2_t out_poly16x4x2_t;
--
-- out_poly16x4x2_t = vld2_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld2p64 (void)
--{
-- poly64x1x2_t out_poly64x1x2_t;
--
-- out_poly64x1x2_t = vld2_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2p8 (void)
--{
-- poly8x8x2_t out_poly8x8x2_t;
--
-- out_poly8x8x2_t = vld2_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2s16 (void)
--{
-- int16x4x2_t out_int16x4x2_t;
--
-- out_int16x4x2_t = vld2_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2s32 (void)
--{
-- int32x2x2_t out_int32x2x2_t;
--
-- out_int32x2x2_t = vld2_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2s64 (void)
--{
-- int64x1x2_t out_int64x1x2_t;
--
-- out_int64x1x2_t = vld2_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2s8 (void)
--{
-- int8x8x2_t out_int8x8x2_t;
--
-- out_int8x8x2_t = vld2_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2u16 (void)
--{
-- uint16x4x2_t out_uint16x4x2_t;
--
-- out_uint16x4x2_t = vld2_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2u32 (void)
--{
-- uint32x2x2_t out_uint32x2x2_t;
--
-- out_uint32x2x2_t = vld2_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2u64 (void)
--{
-- uint64x1x2_t out_uint64x1x2_t;
--
-- out_uint64x1x2_t = vld2_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld2u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld2u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld2u8 (void)
--{
-- uint8x8x2_t out_uint8x8x2_t;
--
-- out_uint8x8x2_t = vld2_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Q_lanef32 (void)
--{
-- float32x4x3_t out_float32x4x3_t;
-- float32x4x3_t arg1_float32x4x3_t;
--
-- out_float32x4x3_t = vld3q_lane_f32 (0, arg1_float32x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Q_lanep16 (void)
--{
-- poly16x8x3_t out_poly16x8x3_t;
-- poly16x8x3_t arg1_poly16x8x3_t;
--
-- out_poly16x8x3_t = vld3q_lane_p16 (0, arg1_poly16x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Q_lanes16 (void)
--{
-- int16x8x3_t out_int16x8x3_t;
-- int16x8x3_t arg1_int16x8x3_t;
--
-- out_int16x8x3_t = vld3q_lane_s16 (0, arg1_int16x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Q_lanes32 (void)
--{
-- int32x4x3_t out_int32x4x3_t;
-- int32x4x3_t arg1_int32x4x3_t;
--
-- out_int32x4x3_t = vld3q_lane_s32 (0, arg1_int32x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Q_laneu16 (void)
--{
-- uint16x8x3_t out_uint16x8x3_t;
-- uint16x8x3_t arg1_uint16x8x3_t;
--
-- out_uint16x8x3_t = vld3q_lane_u16 (0, arg1_uint16x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Q_laneu32 (void)
--{
-- uint32x4x3_t out_uint32x4x3_t;
-- uint32x4x3_t arg1_uint32x4x3_t;
--
-- out_uint32x4x3_t = vld3q_lane_u32 (0, arg1_uint32x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qf32 (void)
--{
-- float32x4x3_t out_float32x4x3_t;
--
-- out_float32x4x3_t = vld3q_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qp16 (void)
--{
-- poly16x8x3_t out_poly16x8x3_t;
--
-- out_poly16x8x3_t = vld3q_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qp8 (void)
--{
-- poly8x16x3_t out_poly8x16x3_t;
--
-- out_poly8x16x3_t = vld3q_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qs16 (void)
--{
-- int16x8x3_t out_int16x8x3_t;
--
-- out_int16x8x3_t = vld3q_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qs32 (void)
--{
-- int32x4x3_t out_int32x4x3_t;
--
-- out_int32x4x3_t = vld3q_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qs8 (void)
--{
-- int8x16x3_t out_int8x16x3_t;
--
-- out_int8x16x3_t = vld3q_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qu16 (void)
--{
-- uint16x8x3_t out_uint16x8x3_t;
--
-- out_uint16x8x3_t = vld3q_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qu32 (void)
--{
-- uint32x4x3_t out_uint32x4x3_t;
--
-- out_uint32x4x3_t = vld3q_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3Qu8 (void)
--{
-- uint8x16x3_t out_uint8x16x3_t;
--
-- out_uint8x16x3_t = vld3q_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupf32 (void)
--{
-- float32x2x3_t out_float32x2x3_t;
--
-- out_float32x2x3_t = vld3_dup_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupp16 (void)
--{
-- poly16x4x3_t out_poly16x4x3_t;
--
-- out_poly16x4x3_t = vld3_dup_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupp64 (void)
--{
-- poly64x1x3_t out_poly64x1x3_t;
--
-- out_poly64x1x3_t = vld3_dup_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupp8 (void)
--{
-- poly8x8x3_t out_poly8x8x3_t;
--
-- out_poly8x8x3_t = vld3_dup_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dups16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dups16 (void)
--{
-- int16x4x3_t out_int16x4x3_t;
--
-- out_int16x4x3_t = vld3_dup_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dups32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dups32 (void)
--{
-- int32x2x3_t out_int32x2x3_t;
--
-- out_int32x2x3_t = vld3_dup_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dups64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dups64 (void)
--{
-- int64x1x3_t out_int64x1x3_t;
--
-- out_int64x1x3_t = vld3_dup_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dups8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dups8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dups8 (void)
--{
-- int8x8x3_t out_int8x8x3_t;
--
-- out_int8x8x3_t = vld3_dup_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupu16 (void)
--{
-- uint16x4x3_t out_uint16x4x3_t;
--
-- out_uint16x4x3_t = vld3_dup_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupu32 (void)
--{
-- uint32x2x3_t out_uint32x2x3_t;
--
-- out_uint32x2x3_t = vld3_dup_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupu64 (void)
--{
-- uint64x1x3_t out_uint64x1x3_t;
--
-- out_uint64x1x3_t = vld3_dup_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_dupu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3_dupu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_dupu8 (void)
--{
-- uint8x8x3_t out_uint8x8x3_t;
--
-- out_uint8x8x3_t = vld3_dup_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_lanef32 (void)
--{
-- float32x2x3_t out_float32x2x3_t;
-- float32x2x3_t arg1_float32x2x3_t;
--
-- out_float32x2x3_t = vld3_lane_f32 (0, arg1_float32x2x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_lanep16 (void)
--{
-- poly16x4x3_t out_poly16x4x3_t;
-- poly16x4x3_t arg1_poly16x4x3_t;
--
-- out_poly16x4x3_t = vld3_lane_p16 (0, arg1_poly16x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_lanep8 (void)
--{
-- poly8x8x3_t out_poly8x8x3_t;
-- poly8x8x3_t arg1_poly8x8x3_t;
--
-- out_poly8x8x3_t = vld3_lane_p8 (0, arg1_poly8x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_lanes16 (void)
--{
-- int16x4x3_t out_int16x4x3_t;
-- int16x4x3_t arg1_int16x4x3_t;
--
-- out_int16x4x3_t = vld3_lane_s16 (0, arg1_int16x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_lanes32 (void)
--{
-- int32x2x3_t out_int32x2x3_t;
-- int32x2x3_t arg1_int32x2x3_t;
--
-- out_int32x2x3_t = vld3_lane_s32 (0, arg1_int32x2x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_lanes8 (void)
--{
-- int8x8x3_t out_int8x8x3_t;
-- int8x8x3_t arg1_int8x8x3_t;
--
-- out_int8x8x3_t = vld3_lane_s8 (0, arg1_int8x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_laneu16 (void)
--{
-- uint16x4x3_t out_uint16x4x3_t;
-- uint16x4x3_t arg1_uint16x4x3_t;
--
-- out_uint16x4x3_t = vld3_lane_u16 (0, arg1_uint16x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_laneu32 (void)
--{
-- uint32x2x3_t out_uint32x2x3_t;
-- uint32x2x3_t arg1_uint32x2x3_t;
--
-- out_uint32x2x3_t = vld3_lane_u32 (0, arg1_uint32x2x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld3_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3_laneu8 (void)
--{
-- uint8x8x3_t out_uint8x8x3_t;
-- uint8x8x3_t arg1_uint8x8x3_t;
--
-- out_uint8x8x3_t = vld3_lane_u8 (0, arg1_uint8x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3f32 (void)
--{
-- float32x2x3_t out_float32x2x3_t;
--
-- out_float32x2x3_t = vld3_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3p16 (void)
--{
-- poly16x4x3_t out_poly16x4x3_t;
--
-- out_poly16x4x3_t = vld3_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld3p64 (void)
--{
-- poly64x1x3_t out_poly64x1x3_t;
--
-- out_poly64x1x3_t = vld3_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3p8 (void)
--{
-- poly8x8x3_t out_poly8x8x3_t;
--
-- out_poly8x8x3_t = vld3_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3s16 (void)
--{
-- int16x4x3_t out_int16x4x3_t;
--
-- out_int16x4x3_t = vld3_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3s32 (void)
--{
-- int32x2x3_t out_int32x2x3_t;
--
-- out_int32x2x3_t = vld3_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3s64 (void)
--{
-- int64x1x3_t out_int64x1x3_t;
--
-- out_int64x1x3_t = vld3_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3s8 (void)
--{
-- int8x8x3_t out_int8x8x3_t;
--
-- out_int8x8x3_t = vld3_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3u16 (void)
--{
-- uint16x4x3_t out_uint16x4x3_t;
--
-- out_uint16x4x3_t = vld3_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3u32 (void)
--{
-- uint32x2x3_t out_uint32x2x3_t;
--
-- out_uint32x2x3_t = vld3_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3u64 (void)
--{
-- uint64x1x3_t out_uint64x1x3_t;
--
-- out_uint64x1x3_t = vld3_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld3u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld3u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld3u8 (void)
--{
-- uint8x8x3_t out_uint8x8x3_t;
--
-- out_uint8x8x3_t = vld3_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Q_lanef32 (void)
--{
-- float32x4x4_t out_float32x4x4_t;
-- float32x4x4_t arg1_float32x4x4_t;
--
-- out_float32x4x4_t = vld4q_lane_f32 (0, arg1_float32x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Q_lanep16 (void)
--{
-- poly16x8x4_t out_poly16x8x4_t;
-- poly16x8x4_t arg1_poly16x8x4_t;
--
-- out_poly16x8x4_t = vld4q_lane_p16 (0, arg1_poly16x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Q_lanes16 (void)
--{
-- int16x8x4_t out_int16x8x4_t;
-- int16x8x4_t arg1_int16x8x4_t;
--
-- out_int16x8x4_t = vld4q_lane_s16 (0, arg1_int16x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Q_lanes32 (void)
--{
-- int32x4x4_t out_int32x4x4_t;
-- int32x4x4_t arg1_int32x4x4_t;
--
-- out_int32x4x4_t = vld4q_lane_s32 (0, arg1_int32x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Q_laneu16 (void)
--{
-- uint16x8x4_t out_uint16x8x4_t;
-- uint16x8x4_t arg1_uint16x8x4_t;
--
-- out_uint16x8x4_t = vld4q_lane_u16 (0, arg1_uint16x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Q_laneu32 (void)
--{
-- uint32x4x4_t out_uint32x4x4_t;
-- uint32x4x4_t arg1_uint32x4x4_t;
--
-- out_uint32x4x4_t = vld4q_lane_u32 (0, arg1_uint32x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qf32 (void)
--{
-- float32x4x4_t out_float32x4x4_t;
--
-- out_float32x4x4_t = vld4q_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qp16 (void)
--{
-- poly16x8x4_t out_poly16x8x4_t;
--
-- out_poly16x8x4_t = vld4q_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qp8 (void)
--{
-- poly8x16x4_t out_poly8x16x4_t;
--
-- out_poly8x16x4_t = vld4q_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qs16 (void)
--{
-- int16x8x4_t out_int16x8x4_t;
--
-- out_int16x8x4_t = vld4q_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qs32 (void)
--{
-- int32x4x4_t out_int32x4x4_t;
--
-- out_int32x4x4_t = vld4q_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qs8 (void)
--{
-- int8x16x4_t out_int8x16x4_t;
--
-- out_int8x16x4_t = vld4q_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qu16 (void)
--{
-- uint16x8x4_t out_uint16x8x4_t;
--
-- out_uint16x8x4_t = vld4q_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qu32 (void)
--{
-- uint32x4x4_t out_uint32x4x4_t;
--
-- out_uint32x4x4_t = vld4q_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4Qu8 (void)
--{
-- uint8x16x4_t out_uint8x16x4_t;
--
-- out_uint8x16x4_t = vld4q_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupf32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupf32 (void)
--{
-- float32x2x4_t out_float32x2x4_t;
--
-- out_float32x2x4_t = vld4_dup_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupp16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupp16 (void)
--{
-- poly16x4x4_t out_poly16x4x4_t;
--
-- out_poly16x4x4_t = vld4_dup_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupp64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupp64 (void)
--{
-- poly64x1x4_t out_poly64x1x4_t;
--
-- out_poly64x1x4_t = vld4_dup_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupp8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupp8 (void)
--{
-- poly8x8x4_t out_poly8x8x4_t;
--
-- out_poly8x8x4_t = vld4_dup_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dups16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dups16 (void)
--{
-- int16x4x4_t out_int16x4x4_t;
--
-- out_int16x4x4_t = vld4_dup_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dups32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dups32 (void)
--{
-- int32x2x4_t out_int32x2x4_t;
--
-- out_int32x2x4_t = vld4_dup_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dups64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dups64 (void)
--{
-- int64x1x4_t out_int64x1x4_t;
--
-- out_int64x1x4_t = vld4_dup_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dups8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dups8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dups8 (void)
--{
-- int8x8x4_t out_int8x8x4_t;
--
-- out_int8x8x4_t = vld4_dup_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupu16 (void)
--{
-- uint16x4x4_t out_uint16x4x4_t;
--
-- out_uint16x4x4_t = vld4_dup_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupu32 (void)
--{
-- uint32x2x4_t out_uint32x2x4_t;
--
-- out_uint32x2x4_t = vld4_dup_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupu64 (void)
--{
-- uint64x1x4_t out_uint64x1x4_t;
--
-- out_uint64x1x4_t = vld4_dup_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_dupu8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4_dupu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_dupu8 (void)
--{
-- uint8x8x4_t out_uint8x8x4_t;
--
-- out_uint8x8x4_t = vld4_dup_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\\\]-\[dD\]\[0-9\]+\\\[\\\])|(\[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\], \[dD\]\[0-9\]+\\\[\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_lanef32 (void)
--{
-- float32x2x4_t out_float32x2x4_t;
-- float32x2x4_t arg1_float32x2x4_t;
--
-- out_float32x2x4_t = vld4_lane_f32 (0, arg1_float32x2x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_lanep16 (void)
--{
-- poly16x4x4_t out_poly16x4x4_t;
-- poly16x4x4_t arg1_poly16x4x4_t;
--
-- out_poly16x4x4_t = vld4_lane_p16 (0, arg1_poly16x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_lanep8 (void)
--{
-- poly8x8x4_t out_poly8x8x4_t;
-- poly8x8x4_t arg1_poly8x8x4_t;
--
-- out_poly8x8x4_t = vld4_lane_p8 (0, arg1_poly8x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_lanes16 (void)
--{
-- int16x4x4_t out_int16x4x4_t;
-- int16x4x4_t arg1_int16x4x4_t;
--
-- out_int16x4x4_t = vld4_lane_s16 (0, arg1_int16x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_lanes32 (void)
--{
-- int32x2x4_t out_int32x2x4_t;
-- int32x2x4_t arg1_int32x2x4_t;
--
-- out_int32x2x4_t = vld4_lane_s32 (0, arg1_int32x2x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_lanes8 (void)
--{
-- int8x8x4_t out_int8x8x4_t;
-- int8x8x4_t arg1_int8x8x4_t;
--
-- out_int8x8x4_t = vld4_lane_s8 (0, arg1_int8x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_laneu16 (void)
--{
-- uint16x4x4_t out_uint16x4x4_t;
-- uint16x4x4_t arg1_uint16x4x4_t;
--
-- out_uint16x4x4_t = vld4_lane_u16 (0, arg1_uint16x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_laneu32 (void)
--{
-- uint32x2x4_t out_uint32x2x4_t;
-- uint32x2x4_t arg1_uint32x2x4_t;
--
-- out_uint32x2x4_t = vld4_lane_u32 (0, arg1_uint32x2x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vld4_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4_laneu8 (void)
--{
-- uint8x8x4_t out_uint8x8x4_t;
-- uint8x8x4_t arg1_uint8x8x4_t;
--
-- out_uint8x8x4_t = vld4_lane_u8 (0, arg1_uint8x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4f32 (void)
--{
-- float32x2x4_t out_float32x2x4_t;
--
-- out_float32x2x4_t = vld4_f32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4p16 (void)
--{
-- poly16x4x4_t out_poly16x4x4_t;
--
-- out_poly16x4x4_t = vld4_p16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vld4p64 (void)
--{
-- poly64x1x4_t out_poly64x1x4_t;
--
-- out_poly64x1x4_t = vld4_p64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4p8 (void)
--{
-- poly8x8x4_t out_poly8x8x4_t;
--
-- out_poly8x8x4_t = vld4_p8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4s16 (void)
--{
-- int16x4x4_t out_int16x4x4_t;
--
-- out_int16x4x4_t = vld4_s16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4s32 (void)
--{
-- int32x2x4_t out_int32x2x4_t;
--
-- out_int32x2x4_t = vld4_s32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4s64 (void)
--{
-- int64x1x4_t out_int64x1x4_t;
--
-- out_int64x1x4_t = vld4_s64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4s8 (void)
--{
-- int8x8x4_t out_int8x8x4_t;
--
-- out_int8x8x4_t = vld4_s8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4u16 (void)
--{
-- uint16x4x4_t out_uint16x4x4_t;
--
-- out_uint16x4x4_t = vld4_u16 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4u32 (void)
--{
-- uint32x2x4_t out_uint32x2x4_t;
--
-- out_uint32x2x4_t = vld4_u32 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4u64 (void)
--{
-- uint64x1x4_t out_uint64x1x4_t;
--
-- out_uint64x1x4_t = vld4_u64 (0);
--}
--
--/* { dg-final { scan-assembler "vld1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vld4u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vld4u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vld4u8 (void)
--{
-- uint8x8x4_t out_uint8x8x4_t;
--
-- out_uint8x8x4_t = vld4_u8 (0);
--}
--
--/* { dg-final { scan-assembler "vld4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vmaxq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vmaxq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vmaxq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vmaxq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vmaxq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vmaxq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vmaxq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vmax_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vmax_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vmax_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vmax_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vmax_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vmax_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmaxu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmaxu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmaxu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vmax_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmax\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vminq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vminq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vminq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vminq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vminq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vminq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vminq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vmin_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmins16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmins16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmins16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vmin_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmins32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmins32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmins32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vmin_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmins8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmins8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmins8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vmin_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vmin_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vmin_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vminu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vminu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vminu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vmin_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmin\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_lanef32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_lanef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x4_t = vmlaq_lane_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x8_t = vmlaq_lane_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x4_t = vmlaq_lane_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_laneu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_laneu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x8_t = vmlaq_lane_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_laneu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_laneu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x4_t = vmlaq_lane_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_nf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_nf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32_t arg2_float32_t;
--
-- out_float32x4_t = vmlaq_n_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16_t arg2_int16_t;
--
-- out_int16x8_t = vmlaq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32_t arg2_int32_t;
--
-- out_int32x4_t = vmlaq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_nu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16_t arg2_uint16_t;
--
-- out_uint16x8_t = vmlaq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQ_nu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32_t arg2_uint32_t;
--
-- out_uint32x4_t = vmlaq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x4_t arg2_float32x4_t;
--
-- out_float32x4_t = vmlaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQs16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16x8_t arg2_int16x8_t;
--
-- out_int16x8_t = vmlaq_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQs32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32x4_t arg2_int32x4_t;
--
-- out_int32x4_t = vmlaq_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQs8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
-- int8x16_t arg2_int8x16_t;
--
-- out_int8x16_t = vmlaq_s8 (arg0_int8x16_t, arg1_int8x16_t, arg2_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16x8_t arg2_uint16x8_t;
--
-- out_uint16x8_t = vmlaq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32x4_t arg2_uint32x4_t;
--
-- out_uint32x4_t = vmlaq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaQu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
-- uint8x16_t arg2_uint8x16_t;
--
-- out_uint8x16_t = vmlaq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_lanef32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_lanef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vmla_lane_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x4_t = vmla_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x2_t = vmla_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_laneu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_laneu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x4_t = vmla_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_laneu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_laneu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x2_t = vmla_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_nf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_nf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32_t arg2_float32_t;
--
-- out_float32x2_t = vmla_n_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16_t arg2_int16_t;
--
-- out_int16x4_t = vmla_n_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32_t arg2_int32_t;
--
-- out_int32x2_t = vmla_n_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_nu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16_t arg2_uint16_t;
--
-- out_uint16x4_t = vmla_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmla_nu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmla_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmla_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32_t arg2_uint32_t;
--
-- out_uint32x2_t = vmla_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlaf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlaf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlaf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vmla_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_lanes16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vmlal_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_lanes32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vmlal_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_laneu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_laneu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint32x4_t = vmlal_lane_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_laneu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_laneu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint64x2_t = vmlal_lane_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16_t arg2_int16_t;
--
-- out_int32x4_t = vmlal_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32_t arg2_int32_t;
--
-- out_int64x2_t = vmlal_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_nu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_nu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16_t arg2_uint16_t;
--
-- out_uint32x4_t = vmlal_n_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlal_nu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlal_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlal_nu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32_t arg2_uint32_t;
--
-- out_uint64x2_t = vmlal_n_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlals16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlals16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlals16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vmlal_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlals32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlals32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlals32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vmlal_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlals8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlals8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlals8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int16x8_t = vmlal_s8 (arg0_int16x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlalu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlalu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlalu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint32x4_t = vmlal_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlalu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlalu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlalu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint64x2_t = vmlal_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlalu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlalu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlalu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint16x8_t = vmlal_u8 (arg0_uint16x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmlal\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlas16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlas16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlas16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x4_t = vmla_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlas32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlas32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlas32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x2_t = vmla_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlas8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlas8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlas8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vmla_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlau16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlau16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlau16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x4_t = vmla_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlau32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlau32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlau32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x2_t = vmla_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlau8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlau8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlau8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vmla_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmla\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_lanef32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_lanef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x4_t = vmlsq_lane_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x8_t = vmlsq_lane_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x4_t = vmlsq_lane_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_laneu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_laneu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x8_t = vmlsq_lane_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_laneu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_laneu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x4_t = vmlsq_lane_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_nf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_nf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32_t arg2_float32_t;
--
-- out_float32x4_t = vmlsq_n_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16_t arg2_int16_t;
--
-- out_int16x8_t = vmlsq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32_t arg2_int32_t;
--
-- out_int32x4_t = vmlsq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_nu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16_t arg2_uint16_t;
--
-- out_uint16x8_t = vmlsq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQ_nu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32_t arg2_uint32_t;
--
-- out_uint32x4_t = vmlsq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
-- float32x4_t arg2_float32x4_t;
--
-- out_float32x4_t = vmlsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQs16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
-- int16x8_t arg2_int16x8_t;
--
-- out_int16x8_t = vmlsq_s16 (arg0_int16x8_t, arg1_int16x8_t, arg2_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQs32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
-- int32x4_t arg2_int32x4_t;
--
-- out_int32x4_t = vmlsq_s32 (arg0_int32x4_t, arg1_int32x4_t, arg2_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQs8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
-- int8x16_t arg2_int8x16_t;
--
-- out_int8x16_t = vmlsq_s8 (arg0_int8x16_t, arg1_int8x16_t, arg2_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
-- uint16x8_t arg2_uint16x8_t;
--
-- out_uint16x8_t = vmlsq_u16 (arg0_uint16x8_t, arg1_uint16x8_t, arg2_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
-- uint32x4_t arg2_uint32x4_t;
--
-- out_uint32x4_t = vmlsq_u32 (arg0_uint32x4_t, arg1_uint32x4_t, arg2_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsQu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
-- uint8x16_t arg2_uint8x16_t;
--
-- out_uint8x16_t = vmlsq_u8 (arg0_uint8x16_t, arg1_uint8x16_t, arg2_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_lanef32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_lanef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vmls_lane_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x4_t = vmls_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x2_t = vmls_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_laneu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_laneu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x4_t = vmls_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_laneu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_laneu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x2_t = vmls_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_nf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_nf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32_t arg2_float32_t;
--
-- out_float32x2_t = vmls_n_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16_t arg2_int16_t;
--
-- out_int16x4_t = vmls_n_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32_t arg2_int32_t;
--
-- out_int32x2_t = vmls_n_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_nu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16_t arg2_uint16_t;
--
-- out_uint16x4_t = vmls_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmls_nu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmls_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmls_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32_t arg2_uint32_t;
--
-- out_uint32x2_t = vmls_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsf32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
-- float32x2_t arg2_float32x2_t;
--
-- out_float32x2_t = vmls_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_lanes16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vmlsl_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_lanes32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vmlsl_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_laneu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_laneu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint32x4_t = vmlsl_lane_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_laneu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_laneu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint64x2_t = vmlsl_lane_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16_t arg2_int16_t;
--
-- out_int32x4_t = vmlsl_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32_t arg2_int32_t;
--
-- out_int64x2_t = vmlsl_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_nu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_nu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16_t arg2_uint16_t;
--
-- out_uint32x4_t = vmlsl_n_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsl_nu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsl_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsl_nu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32_t arg2_uint32_t;
--
-- out_uint64x2_t = vmlsl_n_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsls16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vmlsl_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsls32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vmlsl_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsls8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsls8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int16x8_t = vmlsl_s8 (arg0_int16x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlslu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlslu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlslu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint32x4_t = vmlsl_u16 (arg0_uint32x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlslu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlslu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlslu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint64x2_t = vmlsl_u32 (arg0_uint64x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlslu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlslu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlslu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint16x8_t = vmlsl_u8 (arg0_uint16x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmlsl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlss16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlss16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlss16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int16x4_t = vmls_s16 (arg0_int16x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlss32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlss32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlss32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int32x2_t = vmls_s32 (arg0_int32x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlss8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlss8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlss8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vmls_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsu16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
-- uint16x4_t arg2_uint16x4_t;
--
-- out_uint16x4_t = vmls_u16 (arg0_uint16x4_t, arg1_uint16x4_t, arg2_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsu32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
-- uint32x2_t arg2_uint32x2_t;
--
-- out_uint32x2_t = vmls_u32 (arg0_uint32x2_t, arg1_uint32x2_t, arg2_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmlsu8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vmlsu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmlsu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vmls_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmls\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_nf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32_t arg0_float32_t;
--
-- out_float32x4_t = vmovq_n_f32 (arg0_float32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_np16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_np16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16_t arg0_poly16_t;
--
-- out_poly16x8_t = vmovq_n_p16 (arg0_poly16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_np8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_np8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8_t arg0_poly8_t;
--
-- out_poly8x16_t = vmovq_n_p8 (arg0_poly8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16_t arg0_int16_t;
--
-- out_int16x8_t = vmovq_n_s16 (arg0_int16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32_t arg0_int32_t;
--
-- out_int32x4_t = vmovq_n_s32 (arg0_int32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vmovQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64_t arg0_int64_t;
--
-- out_int64x2_t = vmovq_n_s64 (arg0_int64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8_t arg0_int8_t;
--
-- out_int8x16_t = vmovq_n_s8 (arg0_int8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16_t arg0_uint16_t;
--
-- out_uint16x8_t = vmovq_n_u16 (arg0_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32_t arg0_uint32_t;
--
-- out_uint32x4_t = vmovq_n_u32 (arg0_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vmovQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint64x2_t = vmovq_n_u64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovQ_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8_t arg0_uint8_t;
--
-- out_uint8x16_t = vmovq_n_u8 (arg0_uint8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_nf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32_t arg0_float32_t;
--
-- out_float32x2_t = vmov_n_f32 (arg0_float32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_np16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_np16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16_t arg0_poly16_t;
--
-- out_poly16x4_t = vmov_n_p16 (arg0_poly16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_np8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_np8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8_t arg0_poly8_t;
--
-- out_poly8x8_t = vmov_n_p8 (arg0_poly8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16_t arg0_int16_t;
--
-- out_int16x4_t = vmov_n_s16 (arg0_int16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32_t arg0_int32_t;
--
-- out_int32x2_t = vmov_n_s32 (arg0_int32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vmov_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64_t arg0_int64_t;
--
-- out_int64x1_t = vmov_n_s64 (arg0_int64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8_t arg0_int8_t;
--
-- out_int8x8_t = vmov_n_s8 (arg0_int8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16_t arg0_uint16_t;
--
-- out_uint16x4_t = vmov_n_u16 (arg0_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.16\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32_t arg0_uint32_t;
--
-- out_uint32x2_t = vmov_n_u32 (arg0_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.32\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vmov_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64_t arg0_uint64_t;
--
-- out_uint64x1_t = vmov_n_u64 (arg0_uint64_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmov_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmov_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmov_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8_t arg0_uint8_t;
--
-- out_uint8x8_t = vmov_n_u8 (arg0_uint8_t);
--}
--
--/* { dg-final { scan-assembler "vdup\.8\[ \]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovls16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int32x4_t = vmovl_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmovl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovls32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int64x2_t = vmovl_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmovl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovls8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovls8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int16x8_t = vmovl_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmovl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovlu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovlu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint32x4_t = vmovl_u16 (arg0_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmovl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovlu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovlu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint64x2_t = vmovl_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmovl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovlu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovlu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint16x8_t = vmovl_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmovl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x8_t = vmovn_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmovn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x4_t = vmovn_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmovn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x2_t = vmovn_s64 (arg0_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vmovn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovnu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovnu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x8_t = vmovn_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmovn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovnu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovnu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x4_t = vmovn_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmovn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmovnu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmovnu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmovnu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x2_t = vmovn_u64 (arg0_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vmovn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_lanef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_lanef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x4_t = vmulq_lane_f32 (arg0_float32x4_t, arg1_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x8_t = vmulq_lane_s16 (arg0_int16x8_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x4_t = vmulq_lane_s32 (arg0_int32x4_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_laneu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_laneu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x8_t = vmulq_lane_u16 (arg0_uint16x8_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_laneu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_laneu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x4_t = vmulq_lane_u32 (arg0_uint32x4_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_nf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_nf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32_t arg1_float32_t;
--
-- out_float32x4_t = vmulq_n_f32 (arg0_float32x4_t, arg1_float32_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16_t arg1_int16_t;
--
-- out_int16x8_t = vmulq_n_s16 (arg0_int16x8_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32_t arg1_int32_t;
--
-- out_int32x4_t = vmulq_n_s32 (arg0_int32x4_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16_t arg1_uint16_t;
--
-- out_uint16x8_t = vmulq_n_u16 (arg0_uint16x8_t, arg1_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQ_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32_t arg1_uint32_t;
--
-- out_uint32x4_t = vmulq_n_u32 (arg0_uint32x4_t, arg1_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vmulq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16_t = vmulq_p8 (arg0_poly8x16_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.p8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vmulq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vmulq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vmulq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vmulq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vmulq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vmulq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_lanef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_lanef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vmul_lane_f32 (arg0_float32x2_t, arg1_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vmul_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vmul_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_laneu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_laneu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vmul_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_laneu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_laneu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vmul_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_nf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_nf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_nf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32_t arg1_float32_t;
--
-- out_float32x2_t = vmul_n_f32 (arg0_float32x2_t, arg1_float32_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16_t arg1_int16_t;
--
-- out_int16x4_t = vmul_n_s16 (arg0_int16x4_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32_t arg1_int32_t;
--
-- out_int32x2_t = vmul_n_s32 (arg0_int32x2_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16_t arg1_uint16_t;
--
-- out_uint16x4_t = vmul_n_u16 (arg0_uint16x4_t, arg1_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmul_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmul_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmul_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32_t arg1_uint32_t;
--
-- out_uint32x2_t = vmul_n_u32 (arg0_uint32x2_t, arg1_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vmul_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_lanes16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vmull_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_lanes32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vmull_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_laneu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_laneu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vmull_lane_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmull\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_laneu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_laneu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vmull_lane_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmull\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16_t arg1_int16_t;
--
-- out_int32x4_t = vmull_n_s16 (arg0_int16x4_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32_t arg1_int32_t;
--
-- out_int64x2_t = vmull_n_s32 (arg0_int32x2_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_nu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16_t arg1_uint16_t;
--
-- out_uint32x4_t = vmull_n_u16 (arg0_uint16x4_t, arg1_uint16_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmull_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmull_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmull_nu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32_t arg1_uint32_t;
--
-- out_uint64x2_t = vmull_n_u32 (arg0_uint32x2_t, arg1_uint32_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmullp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmullp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmullp8 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly16x8_t = vmull_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.p8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vmull_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vmull_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulls8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x8_t = vmull_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmullu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmullu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmullu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vmull_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmullu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmullu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmullu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vmull_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmullu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmullu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmullu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x8_t = vmull_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmull\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8_t = vmul_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.p8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmuls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmuls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmuls16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vmul_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmuls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmuls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmuls32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vmul_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmuls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmuls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmuls8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vmul_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vmul_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vmul_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmulu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vmulu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmulu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vmul_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmul\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x16_t = vmvnq_p8 (arg0_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vmvnq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vmvnq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vmvnq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vmvnq_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vmvnq_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnQu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vmvnq_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnp8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x8_t = vmvn_p8 (arg0_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vmvn_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vmvn_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vmvn_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vmvn_u16 (arg0_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vmvn_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vmvnu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vmvnu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vmvnu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vmvn_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vmvn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vnegq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vnegq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vnegq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vnegq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vneg_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vneg_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vneg_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vnegs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vnegs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vnegs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vneg_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vneg\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int16x8_t out_int16x8_t;
--int16x8_t arg0_int16x8_t;
--int16x8_t arg1_int16x8_t;
--void test_vornQs16 (void)
--{
--
-- out_int16x8_t = vornq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int32x4_t out_int32x4_t;
--int32x4_t arg0_int32x4_t;
--int32x4_t arg1_int32x4_t;
--void test_vornQs32 (void)
--{
--
-- out_int32x4_t = vornq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int64x2_t out_int64x2_t;
--int64x2_t arg0_int64x2_t;
--int64x2_t arg1_int64x2_t;
--void test_vornQs64 (void)
--{
--
-- out_int64x2_t = vornq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int8x16_t out_int8x16_t;
--int8x16_t arg0_int8x16_t;
--int8x16_t arg1_int8x16_t;
--void test_vornQs8 (void)
--{
--
-- out_int8x16_t = vornq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint16x8_t out_uint16x8_t;
--uint16x8_t arg0_uint16x8_t;
--uint16x8_t arg1_uint16x8_t;
--void test_vornQu16 (void)
--{
--
-- out_uint16x8_t = vornq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint32x4_t out_uint32x4_t;
--uint32x4_t arg0_uint32x4_t;
--uint32x4_t arg1_uint32x4_t;
--void test_vornQu32 (void)
--{
--
-- out_uint32x4_t = vornq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint64x2_t out_uint64x2_t;
--uint64x2_t arg0_uint64x2_t;
--uint64x2_t arg1_uint64x2_t;
--void test_vornQu64 (void)
--{
--
-- out_uint64x2_t = vornq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint8x16_t out_uint8x16_t;
--uint8x16_t arg0_uint8x16_t;
--uint8x16_t arg1_uint8x16_t;
--void test_vornQu8 (void)
--{
--
-- out_uint8x16_t = vornq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int16x4_t out_int16x4_t;
--int16x4_t arg0_int16x4_t;
--int16x4_t arg1_int16x4_t;
--void test_vorns16 (void)
--{
--
-- out_int16x4_t = vorn_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int32x2_t out_int32x2_t;
--int32x2_t arg0_int32x2_t;
--int32x2_t arg1_int32x2_t;
--void test_vorns32 (void)
--{
--
-- out_int32x2_t = vorn_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vorns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int64x1_t out_int64x1_t;
--int64x1_t arg0_int64x1_t;
--int64x1_t arg1_int64x1_t;
--void test_vorns64 (void)
--{
--
-- out_int64x1_t = vorn_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--int8x8_t out_int8x8_t;
--int8x8_t arg0_int8x8_t;
--int8x8_t arg1_int8x8_t;
--void test_vorns8 (void)
--{
--
-- out_int8x8_t = vorn_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint16x4_t out_uint16x4_t;
--uint16x4_t arg0_uint16x4_t;
--uint16x4_t arg1_uint16x4_t;
--void test_vornu16 (void)
--{
--
-- out_uint16x4_t = vorn_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint32x2_t out_uint32x2_t;
--uint32x2_t arg0_uint32x2_t;
--uint32x2_t arg1_uint32x2_t;
--void test_vornu32 (void)
--{
--
-- out_uint32x2_t = vorn_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vornu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint64x1_t out_uint64x1_t;
--uint64x1_t arg0_uint64x1_t;
--uint64x1_t arg1_uint64x1_t;
--void test_vornu64 (void)
--{
--
-- out_uint64x1_t = vorn_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vornu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vornu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O2" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--uint8x8_t out_uint8x8_t;
--uint8x8_t arg0_uint8x8_t;
--uint8x8_t arg1_uint8x8_t;
--void test_vornu8 (void)
--{
--
-- out_uint8x8_t = vorn_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vorn\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vorrq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vorrq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vorrq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vorrq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vorrq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vorrq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vorrq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vorrq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vorr_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vorr_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vorrs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrs64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vorr_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorrs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorrs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorrs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vorr_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorru16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorru16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorru16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vorr_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorru32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorru32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorru32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vorr_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorru64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vorru64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorru64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vorr_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vorru8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vorru8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vorru8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vorr_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vorr\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalQs16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int32x4_t = vpadalq_s16 (arg0_int32x4_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalQs32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int64x2_t = vpadalq_s32 (arg0_int64x2_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalQs8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int16x8_t = vpadalq_s8 (arg0_int16x8_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalQu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint32x4_t = vpadalq_u16 (arg0_uint32x4_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalQu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint64x2_t = vpadalq_u32 (arg0_uint64x2_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalQu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint16x8_t = vpadalq_u8 (arg0_uint16x8_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadals16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadals16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadals16 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x2_t = vpadal_s16 (arg0_int32x2_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadals32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadals32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadals32 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x1_t = vpadal_s32 (arg0_int64x1_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadals8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadals8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadals8 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x4_t = vpadal_s8 (arg0_int16x4_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalu16 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x2_t = vpadal_u16 (arg0_uint32x2_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalu32 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x1_t = vpadal_u32 (arg0_uint64x1_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadalu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadalu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadalu8 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x4_t = vpadal_u8 (arg0_uint16x4_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpadal\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpaddf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vpadd_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlQs16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int32x4_t = vpaddlq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlQs32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int64x2_t = vpaddlq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlQs8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int16x8_t = vpaddlq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlQu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint32x4_t = vpaddlq_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlQu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint64x2_t = vpaddlq_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlQu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlQu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint16x8_t = vpaddlq_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddls16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddls16 (void)
--{
-- int32x2_t out_int32x2_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int32x2_t = vpaddl_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddls32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddls32 (void)
--{
-- int64x1_t out_int64x1_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int64x1_t = vpaddl_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddls8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddls8 (void)
--{
-- int16x4_t out_int16x4_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int16x4_t = vpaddl_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlu16 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint32x2_t = vpaddl_u16 (arg0_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlu32 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint64x1_t = vpaddl_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddlu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vpaddlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddlu8 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint16x4_t = vpaddl_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpaddl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadds16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadds16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadds16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vpadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadds32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadds32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadds32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vpadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpadds8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpadds8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpadds8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vpadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpaddu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vpadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpaddu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vpadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpaddu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpaddu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpaddu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vpadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpadd\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vpmax_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vpmax_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vpmax_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vpmax_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vpmax_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vpmax_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmaxu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmaxu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmaxu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vpmax_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpmax\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpminf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpminf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpminf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vpmin_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmins16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmins16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmins16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vpmin_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmins32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmins32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmins32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vpmin_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpmins8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpmins8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpmins8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vpmin_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpminu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpminu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpminu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vpmin_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpminu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpminu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpminu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vpmin_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vpminu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vpminu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vpminu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vpmin_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vpmin\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x8_t = vqrdmulhq_lane_s16 (arg0_int16x8_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x4_t = vqrdmulhq_lane_s32 (arg0_int32x4_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16_t arg1_int16_t;
--
-- out_int16x8_t = vqrdmulhq_n_s16 (arg0_int16x8_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32_t arg1_int32_t;
--
-- out_int32x4_t = vqrdmulhq_n_s32 (arg0_int32x4_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vqrdmulhq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vqrdmulhq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulh_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulh_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqrdmulh_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulh_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulh_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqrdmulh_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulh_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulh_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16_t arg1_int16_t;
--
-- out_int16x4_t = vqrdmulh_n_s16 (arg0_int16x4_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulh_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulh_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulh_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32_t arg1_int32_t;
--
-- out_int32x2_t = vqrdmulh_n_s32 (arg0_int32x2_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqrdmulh_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRdmulhs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRdmulhs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRdmulhs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqrdmulh_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqrdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vqrshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vqrshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vqrshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vqrshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vqrshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vqrshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_uint64x2_t = vqrshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vqrshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshls16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqrshl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshls32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqrshl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshls64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshls64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vqrshl_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshls8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vqrshl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vqrshl_u16 (arg0_uint16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vqrshl_u32 (arg0_uint32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_uint64x1_t = vqrshl_u64 (arg0_uint64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshlu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqRshlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshlu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vqrshl_u8 (arg0_uint8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqrshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrn_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrn_ns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x8_t = vqrshrn_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrn\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrn_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrn_ns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x4_t = vqrshrn_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrn\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrn_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrn_ns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x2_t = vqrshrn_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrn\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrn_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrn_nu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x8_t = vqrshrn_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrn\.u16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrn_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrn_nu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x4_t = vqrshrn_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrn\.u32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrn_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrn_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrn_nu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x2_t = vqrshrn_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrn\.u64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrun_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrun_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrun_ns16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint8x8_t = vqrshrun_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrun\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrun_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrun_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrun_ns32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint16x4_t = vqrshrun_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrun\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqRshrun_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqRshrun_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqRshrun_ns64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint32x2_t = vqrshrun_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqrshrun\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqabsQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqabsQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqabsQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vqabsq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqabs\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqabsQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqabsQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqabsQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vqabsq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqabs\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqabsQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqabsQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqabsQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vqabsq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqabs\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqabss16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqabss16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqabss16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vqabs_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqabs\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqabss32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqabss32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqabss32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vqabs_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqabs\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqabss8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqabss8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqabss8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vqabs_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqabs\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vqaddq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vqaddq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vqaddq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vqaddq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vqaddq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vqaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vqaddq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vqaddq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqadds16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqadds16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqadd_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqadds32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqadds32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqadd_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqadds64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqadds64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vqadd_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqadds8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqadds8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqadds8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vqadd_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vqadd_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vqadd_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vqadd_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqaddu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqaddu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqaddu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vqadd_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqadd\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlal_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlal_lanes16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vqdmlal_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlal_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlal_lanes32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vqdmlal_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlal_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlal_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16_t arg2_int16_t;
--
-- out_int32x4_t = vqdmlal_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlal_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlal_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlal_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32_t arg2_int32_t;
--
-- out_int64x2_t = vqdmlal_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlals16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlals16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlals16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vqdmlal_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlal\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlals32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlals32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlals32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vqdmlal_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlal\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_lanes16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlsl_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlsl_lanes16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vqdmlsl_lane_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_lanes32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlsl_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlsl_lanes32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vqdmlsl_lane_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_ns16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlsl_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlsl_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16_t arg2_int16_t;
--
-- out_int32x4_t = vqdmlsl_n_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsl_ns32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlsl_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlsl_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32_t arg2_int32_t;
--
-- out_int64x2_t = vqdmlsl_n_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsls16.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlsls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlsls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
-- int16x4_t arg2_int16x4_t;
--
-- out_int32x4_t = vqdmlsl_s16 (arg0_int32x4_t, arg1_int16x4_t, arg2_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlsl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmlsls32.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vqdmlsls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmlsls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
-- int32x2_t arg2_int32x2_t;
--
-- out_int64x2_t = vqdmlsl_s32 (arg0_int64x2_t, arg1_int32x2_t, arg2_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqdmlsl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x8_t = vqdmulhq_lane_s16 (arg0_int16x8_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x4_t = vqdmulhq_lane_s32 (arg0_int32x4_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16_t arg1_int16_t;
--
-- out_int16x8_t = vqdmulhq_n_s16 (arg0_int16x8_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32_t arg1_int32_t;
--
-- out_int32x4_t = vqdmulhq_n_s32 (arg0_int32x4_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vqdmulhq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vqdmulhq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulh_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulh_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqdmulh_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulh_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulh_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqdmulh_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulh_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulh_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16_t arg1_int16_t;
--
-- out_int16x4_t = vqdmulh_n_s16 (arg0_int16x4_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulh_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulh_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulh_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32_t arg1_int32_t;
--
-- out_int32x2_t = vqdmulh_n_s32 (arg0_int32x2_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqdmulh_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulhs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulhs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulhs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqdmulh_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqdmulh\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmull_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmull_lanes16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vqdmull_lane_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmull_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmull_lanes32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vqdmull_lane_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqdmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmull_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmull_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16_t arg1_int16_t;
--
-- out_int32x4_t = vqdmull_n_s16 (arg0_int16x4_t, arg1_int16_t);
--}
--
--/* { dg-final { scan-assembler "vqdmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmull_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmull_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmull_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32_t arg1_int32_t;
--
-- out_int64x2_t = vqdmull_n_s32 (arg0_int32x2_t, arg1_int32_t);
--}
--
--/* { dg-final { scan-assembler "vqdmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vqdmull_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqdmull\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqdmulls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqdmulls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqdmulls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vqdmull_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqdmull\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x8_t = vqmovn_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqmovn\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x4_t = vqmovn_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqmovn\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x2_t = vqmovn_s64 (arg0_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqmovn\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovnu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovnu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x8_t = vqmovn_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqmovn\.u16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovnu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovnu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x4_t = vqmovn_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqmovn\.u32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovnu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovnu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovnu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x2_t = vqmovn_u64 (arg0_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqmovn\.u64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovuns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovuns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovuns16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint8x8_t = vqmovun_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqmovun\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovuns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovuns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovuns32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint16x4_t = vqmovun_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqmovun\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqmovuns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqmovuns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqmovuns64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint32x2_t = vqmovun_s64 (arg0_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqmovun\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegQs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqnegQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqnegQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vqnegq_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqneg\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegQs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqnegQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqnegQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vqnegq_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqneg\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegQs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqnegQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqnegQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vqnegq_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqneg\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqnegs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqnegs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vqneg_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqneg\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqnegs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqnegs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vqneg_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqneg\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqnegs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqnegs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqnegs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vqneg_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqneg\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vqshlq_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vqshlq_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int64x2_t = vqshlq_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vqshlq_n_s8 (arg0_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vqshlq_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vqshlq_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64x2_t = vqshlq_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQ_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vqshlq_n_u8 (arg0_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vqshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vqshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vqshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vqshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vqshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vqshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_uint64x2_t = vqshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vqshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vqshl_n_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vqshl_n_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64x1_t = vqshl_n_s64 (arg0_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vqshl_n_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vqshl_n_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vqshl_n_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64x1_t = vqshl_n_u64 (arg0_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshl_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshl_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshl_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vqshl_n_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshls16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqshl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshls32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqshl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshls64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshls64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vqshl_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshls8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vqshl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vqshl_u16 (arg0_uint16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vqshl_u32 (arg0_uint32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_uint64x1_t = vqshl_u64 (arg0_uint64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqshlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vqshl_u8 (arg0_uint8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshluQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshluQ_ns16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint16x8_t = vqshluq_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshluQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshluQ_ns32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint32x4_t = vqshluq_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshluQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshluQ_ns64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint64x2_t = vqshluq_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshluQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshluQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshluQ_ns8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_uint8x16_t = vqshluq_n_s8 (arg0_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlu_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu_ns16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_uint16x4_t = vqshlu_n_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlu_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu_ns32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_uint32x2_t = vqshlu_n_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlu_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu_ns64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_uint64x1_t = vqshlu_n_s64 (arg0_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshlu_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshlu_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshlu_ns8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_uint8x8_t = vqshlu_n_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshlu\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrn_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrn_ns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x8_t = vqshrn_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrn\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrn_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrn_ns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x4_t = vqshrn_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrn\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrn_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrn_ns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x2_t = vqshrn_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrn\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrn_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrn_nu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x8_t = vqshrn_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrn\.u16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrn_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrn_nu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x4_t = vqshrn_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrn\.u32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrn_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrn_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrn_nu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x2_t = vqshrn_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrn\.u64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrun_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrun_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrun_ns16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint8x8_t = vqshrun_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrun\.s16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrun_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrun_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrun_ns32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint16x4_t = vqshrun_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrun\.s32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqshrun_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vqshrun_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqshrun_ns64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint32x2_t = vqshrun_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vqshrun\.s64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vqsubq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vqsubq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vqsubq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vqsubq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vqsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vqsubq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vqsubq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vqsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vqsub_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vqsub_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubs64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vqsub_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vqsub_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vqsub_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vqsub_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vqsub_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vqsubu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vqsubu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vqsubu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vqsub_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vqsub\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpeQf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrecpeQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrecpeQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrecpeq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrecpe\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpeQu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrecpeQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrecpeQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vrecpeq_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrecpe\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrecpef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrecpef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrecpe_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrecpe\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpeu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrecpeu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrecpeu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vrecpe_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrecpe\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpsQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vrecpsQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrecpsQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vrecpsq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrecps\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrecpsf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vrecpsf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrecpsf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vrecps_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrecps\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_p128 (void)
--{
-- float32x4_t out_float32x4_t;
-- poly128_t arg0_poly128_t;
--
-- out_float32x4_t = vreinterpretq_f32_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_p16 (void)
--{
-- float32x4_t out_float32x4_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_float32x4_t = vreinterpretq_f32_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_p64 (void)
--{
-- float32x4_t out_float32x4_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_float32x4_t = vreinterpretq_f32_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_p8 (void)
--{
-- float32x4_t out_float32x4_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_float32x4_t = vreinterpretq_f32_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_s16 (void)
--{
-- float32x4_t out_float32x4_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_float32x4_t = vreinterpretq_f32_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_s32 (void)
--{
-- float32x4_t out_float32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_float32x4_t = vreinterpretq_f32_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_s64 (void)
--{
-- float32x4_t out_float32x4_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_float32x4_t = vreinterpretq_f32_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_s8 (void)
--{
-- float32x4_t out_float32x4_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_float32x4_t = vreinterpretq_f32_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_u16 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_float32x4_t = vreinterpretq_f32_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_u32 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_float32x4_t = vreinterpretq_f32_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_u64 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_float32x4_t = vreinterpretq_f32_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQf32_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQf32_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQf32_u8 (void)
--{
-- float32x4_t out_float32x4_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_float32x4_t = vreinterpretq_f32_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_f32 (void)
--{
-- poly128_t out_poly128_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_poly128_t = vreinterpretq_p128_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_p16 (void)
--{
-- poly128_t out_poly128_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly128_t = vreinterpretq_p128_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_p64 (void)
--{
-- poly128_t out_poly128_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_poly128_t = vreinterpretq_p128_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_p8 (void)
--{
-- poly128_t out_poly128_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly128_t = vreinterpretq_p128_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_s16 (void)
--{
-- poly128_t out_poly128_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_poly128_t = vreinterpretq_p128_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_s32 (void)
--{
-- poly128_t out_poly128_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_poly128_t = vreinterpretq_p128_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_s64 (void)
--{
-- poly128_t out_poly128_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_poly128_t = vreinterpretq_p128_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_s8 (void)
--{
-- poly128_t out_poly128_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_poly128_t = vreinterpretq_p128_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_u16 (void)
--{
-- poly128_t out_poly128_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_poly128_t = vreinterpretq_p128_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_u32 (void)
--{
-- poly128_t out_poly128_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_poly128_t = vreinterpretq_p128_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_u64 (void)
--{
-- poly128_t out_poly128_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_poly128_t = vreinterpretq_p128_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp128_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp128_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp128_u8 (void)
--{
-- poly128_t out_poly128_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_poly128_t = vreinterpretq_p128_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_f32 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_poly16x8_t = vreinterpretq_p16_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_p128 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly128_t arg0_poly128_t;
--
-- out_poly16x8_t = vreinterpretq_p16_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_p64 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_poly16x8_t = vreinterpretq_p16_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_p8 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly16x8_t = vreinterpretq_p16_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_s16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_poly16x8_t = vreinterpretq_p16_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_s32 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_poly16x8_t = vreinterpretq_p16_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_s64 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_poly16x8_t = vreinterpretq_p16_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_s8 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_poly16x8_t = vreinterpretq_p16_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_u16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_poly16x8_t = vreinterpretq_p16_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_u32 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_poly16x8_t = vreinterpretq_p16_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_u64 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_poly16x8_t = vreinterpretq_p16_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp16_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp16_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp16_u8 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_poly16x8_t = vreinterpretq_p16_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_f32 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_poly64x2_t = vreinterpretq_p64_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_p128 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly128_t arg0_poly128_t;
--
-- out_poly64x2_t = vreinterpretq_p64_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_p16 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly64x2_t = vreinterpretq_p64_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_p8 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly64x2_t = vreinterpretq_p64_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_s16 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_poly64x2_t = vreinterpretq_p64_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_s32 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_poly64x2_t = vreinterpretq_p64_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_s64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_poly64x2_t = vreinterpretq_p64_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_s8 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_poly64x2_t = vreinterpretq_p64_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_u16 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_poly64x2_t = vreinterpretq_p64_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_u32 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_poly64x2_t = vreinterpretq_p64_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_u64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_poly64x2_t = vreinterpretq_p64_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp64_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp64_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp64_u8 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_poly64x2_t = vreinterpretq_p64_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_f32 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_poly8x16_t = vreinterpretq_p8_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_p128 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly128_t arg0_poly128_t;
--
-- out_poly8x16_t = vreinterpretq_p8_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_p16 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly8x16_t = vreinterpretq_p8_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_p64 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_poly8x16_t = vreinterpretq_p8_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_s16 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_poly8x16_t = vreinterpretq_p8_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_s32 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_poly8x16_t = vreinterpretq_p8_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_s64 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_poly8x16_t = vreinterpretq_p8_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_s8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_poly8x16_t = vreinterpretq_p8_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_u16 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_poly8x16_t = vreinterpretq_p8_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_u32 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_poly8x16_t = vreinterpretq_p8_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_u64 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_poly8x16_t = vreinterpretq_p8_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQp8_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQp8_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQp8_u8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_poly8x16_t = vreinterpretq_p8_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_f32 (void)
--{
-- int16x8_t out_int16x8_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_int16x8_t = vreinterpretq_s16_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_p128 (void)
--{
-- int16x8_t out_int16x8_t;
-- poly128_t arg0_poly128_t;
--
-- out_int16x8_t = vreinterpretq_s16_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_p16 (void)
--{
-- int16x8_t out_int16x8_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_int16x8_t = vreinterpretq_s16_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_p64 (void)
--{
-- int16x8_t out_int16x8_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_int16x8_t = vreinterpretq_s16_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_p8 (void)
--{
-- int16x8_t out_int16x8_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_int16x8_t = vreinterpretq_s16_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_s32 (void)
--{
-- int16x8_t out_int16x8_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x8_t = vreinterpretq_s16_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_s64 (void)
--{
-- int16x8_t out_int16x8_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int16x8_t = vreinterpretq_s16_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_s8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int16x8_t = vreinterpretq_s16_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_u16 (void)
--{
-- int16x8_t out_int16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_int16x8_t = vreinterpretq_s16_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_u32 (void)
--{
-- int16x8_t out_int16x8_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_int16x8_t = vreinterpretq_s16_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_u64 (void)
--{
-- int16x8_t out_int16x8_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_int16x8_t = vreinterpretq_s16_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs16_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs16_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs16_u8 (void)
--{
-- int16x8_t out_int16x8_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_int16x8_t = vreinterpretq_s16_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_f32 (void)
--{
-- int32x4_t out_int32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_int32x4_t = vreinterpretq_s32_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_p128 (void)
--{
-- int32x4_t out_int32x4_t;
-- poly128_t arg0_poly128_t;
--
-- out_int32x4_t = vreinterpretq_s32_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_p16 (void)
--{
-- int32x4_t out_int32x4_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_int32x4_t = vreinterpretq_s32_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_p64 (void)
--{
-- int32x4_t out_int32x4_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_int32x4_t = vreinterpretq_s32_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_p8 (void)
--{
-- int32x4_t out_int32x4_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_int32x4_t = vreinterpretq_s32_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_s16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int32x4_t = vreinterpretq_s32_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_s64 (void)
--{
-- int32x4_t out_int32x4_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x4_t = vreinterpretq_s32_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_s8 (void)
--{
-- int32x4_t out_int32x4_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int32x4_t = vreinterpretq_s32_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_u16 (void)
--{
-- int32x4_t out_int32x4_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_int32x4_t = vreinterpretq_s32_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_u32 (void)
--{
-- int32x4_t out_int32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_int32x4_t = vreinterpretq_s32_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_u64 (void)
--{
-- int32x4_t out_int32x4_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_int32x4_t = vreinterpretq_s32_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs32_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs32_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs32_u8 (void)
--{
-- int32x4_t out_int32x4_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_int32x4_t = vreinterpretq_s32_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_f32 (void)
--{
-- int64x2_t out_int64x2_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_int64x2_t = vreinterpretq_s64_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_p128 (void)
--{
-- int64x2_t out_int64x2_t;
-- poly128_t arg0_poly128_t;
--
-- out_int64x2_t = vreinterpretq_s64_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_p16 (void)
--{
-- int64x2_t out_int64x2_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_int64x2_t = vreinterpretq_s64_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_p64 (void)
--{
-- int64x2_t out_int64x2_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_int64x2_t = vreinterpretq_s64_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_p8 (void)
--{
-- int64x2_t out_int64x2_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_int64x2_t = vreinterpretq_s64_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_s16 (void)
--{
-- int64x2_t out_int64x2_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int64x2_t = vreinterpretq_s64_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_s32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int64x2_t = vreinterpretq_s64_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_s8 (void)
--{
-- int64x2_t out_int64x2_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int64x2_t = vreinterpretq_s64_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_u16 (void)
--{
-- int64x2_t out_int64x2_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_int64x2_t = vreinterpretq_s64_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_u32 (void)
--{
-- int64x2_t out_int64x2_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_int64x2_t = vreinterpretq_s64_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_u64 (void)
--{
-- int64x2_t out_int64x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_int64x2_t = vreinterpretq_s64_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs64_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs64_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs64_u8 (void)
--{
-- int64x2_t out_int64x2_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_int64x2_t = vreinterpretq_s64_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_f32 (void)
--{
-- int8x16_t out_int8x16_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_int8x16_t = vreinterpretq_s8_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_p128 (void)
--{
-- int8x16_t out_int8x16_t;
-- poly128_t arg0_poly128_t;
--
-- out_int8x16_t = vreinterpretq_s8_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_p16 (void)
--{
-- int8x16_t out_int8x16_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_int8x16_t = vreinterpretq_s8_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_p64 (void)
--{
-- int8x16_t out_int8x16_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_int8x16_t = vreinterpretq_s8_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_p8 (void)
--{
-- int8x16_t out_int8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_int8x16_t = vreinterpretq_s8_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_s16 (void)
--{
-- int8x16_t out_int8x16_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x16_t = vreinterpretq_s8_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_s32 (void)
--{
-- int8x16_t out_int8x16_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int8x16_t = vreinterpretq_s8_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_s64 (void)
--{
-- int8x16_t out_int8x16_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int8x16_t = vreinterpretq_s8_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_u16 (void)
--{
-- int8x16_t out_int8x16_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_int8x16_t = vreinterpretq_s8_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_u32 (void)
--{
-- int8x16_t out_int8x16_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_int8x16_t = vreinterpretq_s8_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_u64 (void)
--{
-- int8x16_t out_int8x16_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_int8x16_t = vreinterpretq_s8_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQs8_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQs8_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQs8_u8 (void)
--{
-- int8x16_t out_int8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_int8x16_t = vreinterpretq_s8_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_f32 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_uint16x8_t = vreinterpretq_u16_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_p128 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- poly128_t arg0_poly128_t;
--
-- out_uint16x8_t = vreinterpretq_u16_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_p16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_uint16x8_t = vreinterpretq_u16_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_p64 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_uint16x8_t = vreinterpretq_u16_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_p8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_uint16x8_t = vreinterpretq_u16_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_s16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint16x8_t = vreinterpretq_u16_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_s32 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint16x8_t = vreinterpretq_u16_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_s64 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint16x8_t = vreinterpretq_u16_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_s8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_uint16x8_t = vreinterpretq_u16_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_u32 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x8_t = vreinterpretq_u16_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_u64 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint16x8_t = vreinterpretq_u16_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu16_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu16_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu16_u8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint16x8_t = vreinterpretq_u16_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_f32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_uint32x4_t = vreinterpretq_u32_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_p128 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- poly128_t arg0_poly128_t;
--
-- out_uint32x4_t = vreinterpretq_u32_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_p16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_uint32x4_t = vreinterpretq_u32_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_p64 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_uint32x4_t = vreinterpretq_u32_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_p8 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_uint32x4_t = vreinterpretq_u32_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_s16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint32x4_t = vreinterpretq_u32_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_s32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint32x4_t = vreinterpretq_u32_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_s64 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint32x4_t = vreinterpretq_u32_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_s8 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_uint32x4_t = vreinterpretq_u32_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_u16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint32x4_t = vreinterpretq_u32_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_u64 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x4_t = vreinterpretq_u32_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu32_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu32_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu32_u8 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint32x4_t = vreinterpretq_u32_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_f32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_uint64x2_t = vreinterpretq_u64_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_p128 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- poly128_t arg0_poly128_t;
--
-- out_uint64x2_t = vreinterpretq_u64_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_p16 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_uint64x2_t = vreinterpretq_u64_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_p64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_uint64x2_t = vreinterpretq_u64_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_p8 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_uint64x2_t = vreinterpretq_u64_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_s16 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint64x2_t = vreinterpretq_u64_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_s32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint64x2_t = vreinterpretq_u64_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_s64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint64x2_t = vreinterpretq_u64_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_s8 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_uint64x2_t = vreinterpretq_u64_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_u16 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint64x2_t = vreinterpretq_u64_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_u32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint64x2_t = vreinterpretq_u64_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu64_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu64_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu64_u8 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint64x2_t = vreinterpretq_u64_u8 (arg0_uint8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_f32 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_uint8x16_t = vreinterpretq_u8_f32 (arg0_float32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_p128.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_p128' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_p128 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- poly128_t arg0_poly128_t;
--
-- out_uint8x16_t = vreinterpretq_u8_p128 (arg0_poly128_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_p16 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_uint8x16_t = vreinterpretq_u8_p16 (arg0_poly16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_p64 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- poly64x2_t arg0_poly64x2_t;
--
-- out_uint8x16_t = vreinterpretq_u8_p64 (arg0_poly64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_p8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_uint8x16_t = vreinterpretq_u8_p8 (arg0_poly8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_s16 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_uint8x16_t = vreinterpretq_u8_s16 (arg0_int16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_s32 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_uint8x16_t = vreinterpretq_u8_s32 (arg0_int32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_s64 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_uint8x16_t = vreinterpretq_u8_s64 (arg0_int64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_s8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_uint8x16_t = vreinterpretq_u8_s8 (arg0_int8x16_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_u16 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x16_t = vreinterpretq_u8_u16 (arg0_uint16x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_u32 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint8x16_t = vreinterpretq_u8_u32 (arg0_uint32x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretQu8_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretQu8_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretQu8_u64 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint8x16_t = vreinterpretq_u8_u64 (arg0_uint64x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_p16 (void)
--{
-- float32x2_t out_float32x2_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_float32x2_t = vreinterpret_f32_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_p64 (void)
--{
-- float32x2_t out_float32x2_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_float32x2_t = vreinterpret_f32_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_p8 (void)
--{
-- float32x2_t out_float32x2_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_float32x2_t = vreinterpret_f32_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_s16 (void)
--{
-- float32x2_t out_float32x2_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_float32x2_t = vreinterpret_f32_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_s32 (void)
--{
-- float32x2_t out_float32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_float32x2_t = vreinterpret_f32_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_s64 (void)
--{
-- float32x2_t out_float32x2_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_float32x2_t = vreinterpret_f32_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_s8 (void)
--{
-- float32x2_t out_float32x2_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_float32x2_t = vreinterpret_f32_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_u16 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_float32x2_t = vreinterpret_f32_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_u32 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_float32x2_t = vreinterpret_f32_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_u64 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_float32x2_t = vreinterpret_f32_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretf32_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretf32_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretf32_u8 (void)
--{
-- float32x2_t out_float32x2_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_float32x2_t = vreinterpret_f32_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_f32 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_poly16x4_t = vreinterpret_p16_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_p64 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_poly16x4_t = vreinterpret_p16_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_p8 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly16x4_t = vreinterpret_p16_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_s16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_poly16x4_t = vreinterpret_p16_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_s32 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_poly16x4_t = vreinterpret_p16_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_s64 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_poly16x4_t = vreinterpret_p16_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_s8 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_poly16x4_t = vreinterpret_p16_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_u16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_poly16x4_t = vreinterpret_p16_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_u32 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_poly16x4_t = vreinterpret_p16_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_u64 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_poly16x4_t = vreinterpret_p16_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp16_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp16_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp16_u8 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_poly16x4_t = vreinterpret_p16_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_f32 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_poly64x1_t = vreinterpret_p64_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_p16 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly64x1_t = vreinterpret_p64_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_p8 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly64x1_t = vreinterpret_p64_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_s16 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_poly64x1_t = vreinterpret_p64_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_s32 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_poly64x1_t = vreinterpret_p64_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_s64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_poly64x1_t = vreinterpret_p64_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_s8 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_poly64x1_t = vreinterpret_p64_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_u16 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_poly64x1_t = vreinterpret_p64_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_u32 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_poly64x1_t = vreinterpret_p64_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_u64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_poly64x1_t = vreinterpret_p64_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp64_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp64_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp64_u8 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_poly64x1_t = vreinterpret_p64_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_f32 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_poly8x8_t = vreinterpret_p8_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_p16 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly8x8_t = vreinterpret_p8_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_p64 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_poly8x8_t = vreinterpret_p8_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_s16 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_poly8x8_t = vreinterpret_p8_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_s32 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_poly8x8_t = vreinterpret_p8_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_s64 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_poly8x8_t = vreinterpret_p8_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_s8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_poly8x8_t = vreinterpret_p8_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_u16 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_poly8x8_t = vreinterpret_p8_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_u32 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_poly8x8_t = vreinterpret_p8_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_u64 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_poly8x8_t = vreinterpret_p8_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretp8_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretp8_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretp8_u8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_poly8x8_t = vreinterpret_p8_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_f32 (void)
--{
-- int16x4_t out_int16x4_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_int16x4_t = vreinterpret_s16_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_p16 (void)
--{
-- int16x4_t out_int16x4_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_int16x4_t = vreinterpret_s16_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_p64 (void)
--{
-- int16x4_t out_int16x4_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_int16x4_t = vreinterpret_s16_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_p8 (void)
--{
-- int16x4_t out_int16x4_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_int16x4_t = vreinterpret_s16_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_s32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int16x4_t = vreinterpret_s16_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_s64 (void)
--{
-- int16x4_t out_int16x4_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int16x4_t = vreinterpret_s16_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_s8 (void)
--{
-- int16x4_t out_int16x4_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int16x4_t = vreinterpret_s16_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_u16 (void)
--{
-- int16x4_t out_int16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_int16x4_t = vreinterpret_s16_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_u32 (void)
--{
-- int16x4_t out_int16x4_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_int16x4_t = vreinterpret_s16_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_u64 (void)
--{
-- int16x4_t out_int16x4_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_int16x4_t = vreinterpret_s16_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets16_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets16_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets16_u8 (void)
--{
-- int16x4_t out_int16x4_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_int16x4_t = vreinterpret_s16_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_f32 (void)
--{
-- int32x2_t out_int32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_int32x2_t = vreinterpret_s32_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_p16 (void)
--{
-- int32x2_t out_int32x2_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_int32x2_t = vreinterpret_s32_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_p64 (void)
--{
-- int32x2_t out_int32x2_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_int32x2_t = vreinterpret_s32_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_p8 (void)
--{
-- int32x2_t out_int32x2_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_int32x2_t = vreinterpret_s32_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_s16 (void)
--{
-- int32x2_t out_int32x2_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int32x2_t = vreinterpret_s32_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_s64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int32x2_t = vreinterpret_s32_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_s8 (void)
--{
-- int32x2_t out_int32x2_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int32x2_t = vreinterpret_s32_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_u16 (void)
--{
-- int32x2_t out_int32x2_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_int32x2_t = vreinterpret_s32_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_u32 (void)
--{
-- int32x2_t out_int32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_int32x2_t = vreinterpret_s32_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_u64 (void)
--{
-- int32x2_t out_int32x2_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_int32x2_t = vreinterpret_s32_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets32_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets32_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets32_u8 (void)
--{
-- int32x2_t out_int32x2_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_int32x2_t = vreinterpret_s32_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_f32 (void)
--{
-- int64x1_t out_int64x1_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_int64x1_t = vreinterpret_s64_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_p16 (void)
--{
-- int64x1_t out_int64x1_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_int64x1_t = vreinterpret_s64_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_p64 (void)
--{
-- int64x1_t out_int64x1_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_int64x1_t = vreinterpret_s64_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_p8 (void)
--{
-- int64x1_t out_int64x1_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_int64x1_t = vreinterpret_s64_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_s16 (void)
--{
-- int64x1_t out_int64x1_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int64x1_t = vreinterpret_s64_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_s32 (void)
--{
-- int64x1_t out_int64x1_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int64x1_t = vreinterpret_s64_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_s8 (void)
--{
-- int64x1_t out_int64x1_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int64x1_t = vreinterpret_s64_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_u16 (void)
--{
-- int64x1_t out_int64x1_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_int64x1_t = vreinterpret_s64_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_u32 (void)
--{
-- int64x1_t out_int64x1_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_int64x1_t = vreinterpret_s64_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_u64 (void)
--{
-- int64x1_t out_int64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_int64x1_t = vreinterpret_s64_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets64_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets64_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets64_u8 (void)
--{
-- int64x1_t out_int64x1_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_int64x1_t = vreinterpret_s64_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_f32 (void)
--{
-- int8x8_t out_int8x8_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_int8x8_t = vreinterpret_s8_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_p16 (void)
--{
-- int8x8_t out_int8x8_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_int8x8_t = vreinterpret_s8_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_p64 (void)
--{
-- int8x8_t out_int8x8_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_int8x8_t = vreinterpret_s8_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_p8 (void)
--{
-- int8x8_t out_int8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_int8x8_t = vreinterpret_s8_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_s16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int8x8_t = vreinterpret_s8_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_s32 (void)
--{
-- int8x8_t out_int8x8_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int8x8_t = vreinterpret_s8_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_s64 (void)
--{
-- int8x8_t out_int8x8_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int8x8_t = vreinterpret_s8_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_u16 (void)
--{
-- int8x8_t out_int8x8_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_int8x8_t = vreinterpret_s8_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_u32 (void)
--{
-- int8x8_t out_int8x8_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_int8x8_t = vreinterpret_s8_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_u64 (void)
--{
-- int8x8_t out_int8x8_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_int8x8_t = vreinterpret_s8_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterprets8_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterprets8_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterprets8_u8 (void)
--{
-- int8x8_t out_int8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_int8x8_t = vreinterpret_s8_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_f32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_uint16x4_t = vreinterpret_u16_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_p16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_uint16x4_t = vreinterpret_u16_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_p64 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_uint16x4_t = vreinterpret_u16_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_p8 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_uint16x4_t = vreinterpret_u16_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_s16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_uint16x4_t = vreinterpret_u16_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_s32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_uint16x4_t = vreinterpret_u16_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_s64 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_uint16x4_t = vreinterpret_u16_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_s8 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_uint16x4_t = vreinterpret_u16_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_u32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint16x4_t = vreinterpret_u16_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_u64 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint16x4_t = vreinterpret_u16_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu16_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu16_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu16_u8 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint16x4_t = vreinterpret_u16_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_f32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_uint32x2_t = vreinterpret_u32_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_p16 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_uint32x2_t = vreinterpret_u32_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_p64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_uint32x2_t = vreinterpret_u32_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_p8 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_uint32x2_t = vreinterpret_u32_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_s16 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_uint32x2_t = vreinterpret_u32_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_s32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_uint32x2_t = vreinterpret_u32_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_s64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_uint32x2_t = vreinterpret_u32_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_s8 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_uint32x2_t = vreinterpret_u32_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_u16 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint32x2_t = vreinterpret_u32_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_u64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint32x2_t = vreinterpret_u32_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu32_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu32_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu32_u8 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint32x2_t = vreinterpret_u32_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_f32 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_uint64x1_t = vreinterpret_u64_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_p16 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_uint64x1_t = vreinterpret_u64_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_p64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_uint64x1_t = vreinterpret_u64_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_p8 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_uint64x1_t = vreinterpret_u64_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_s16 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_uint64x1_t = vreinterpret_u64_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_s32 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_uint64x1_t = vreinterpret_u64_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_s64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_uint64x1_t = vreinterpret_u64_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_s8 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_uint64x1_t = vreinterpret_u64_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_u16 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint64x1_t = vreinterpret_u64_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_u32 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint64x1_t = vreinterpret_u64_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu64_u8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu64_u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu64_u8 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint64x1_t = vreinterpret_u64_u8 (arg0_uint8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_f32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_f32 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_uint8x8_t = vreinterpret_u8_f32 (arg0_float32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_p16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_p16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_uint8x8_t = vreinterpret_u8_p16 (arg0_poly16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_p64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_p64 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- poly64x1_t arg0_poly64x1_t;
--
-- out_uint8x8_t = vreinterpret_u8_p64 (arg0_poly64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_p8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_p8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_uint8x8_t = vreinterpret_u8_p8 (arg0_poly8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_s16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_uint8x8_t = vreinterpret_u8_s16 (arg0_int16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_s32 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_uint8x8_t = vreinterpret_u8_s32 (arg0_int32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_s64 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_uint8x8_t = vreinterpret_u8_s64 (arg0_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_s8.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_s8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_uint8x8_t = vreinterpret_u8_s8 (arg0_int8x8_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_u16.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_u16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint8x8_t = vreinterpret_u8_u16 (arg0_uint16x4_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_u32.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_u32 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint8x8_t = vreinterpret_u8_u32 (arg0_uint32x2_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vreinterpretu8_u64.c
-+++ b/src//dev/null
-@@ -1,18 +0,0 @@
--/* Test the `vreinterpretu8_u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vreinterpretu8_u64 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint8x8_t = vreinterpret_u8_u64 (arg0_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev16Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev16Qp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x16_t = vrev16q_p8 (arg0_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev16Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev16Qs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vrev16q_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev16Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev16Qu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vrev16q_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev16p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev16p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x8_t = vrev16_p8 (arg0_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev16s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev16s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vrev16_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev16u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev16u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev16u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vrev16_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev16\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32Qp16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly16x8_t = vrev32q_p16 (arg0_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32Qp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x16_t = vrev32q_p8 (arg0_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32Qs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vrev32q_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32Qs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vrev32q_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32Qu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vrev32q_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32Qu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vrev32q_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32p16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32p16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly16x4_t = vrev32_p16 (arg0_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x8_t = vrev32_p8 (arg0_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32s16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32s16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vrev32_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vrev32_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32u16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32u16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vrev32_u16 (arg0_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev32u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev32u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev32u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vrev32_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev32\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrev64q_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qp16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x8_t arg0_poly16x8_t;
--
-- out_poly16x8_t = vrev64q_p16 (arg0_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qp8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
--
-- out_poly8x16_t = vrev64q_p8 (arg0_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vrev64q_s16 (arg0_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vrev64q_s32 (arg0_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vrev64q_s8 (arg0_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vrev64q_u16 (arg0_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vrev64q_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64Qu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vrev64q_u8 (arg0_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64f32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrev64_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64p16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64p16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg0_poly16x4_t;
--
-- out_poly16x4_t = vrev64_p16 (arg0_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
--
-- out_poly8x8_t = vrev64_p8 (arg0_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64s16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64s16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vrev64_s16 (arg0_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64s32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vrev64_s32 (arg0_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vrev64_s8 (arg0_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64u16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64u16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vrev64_u16 (arg0_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64u32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vrev64_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrev64u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrev64u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrev64u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vrev64_u8 (arg0_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vrev64\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndaf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndaf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndaf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrnda_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrinta\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndaqf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndaq_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndaqf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrndaq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrinta\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrnd_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrintz\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndmf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndmf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndmf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrndm_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrintm\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndmqf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndmq_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndmqf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrndmq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrintm\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndnf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndnf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndnf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrndn_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrintn\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndnqf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndnq_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndnqf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrndnq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrintn\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndpf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndpf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndpf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrndp_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrintp\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndpqf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndpq_f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndpqf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrndpq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrintp\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrndqf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrndqf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_v8_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_v8_neon } */
--
--#include "arm_neon.h"
--
--void test_vrndqf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrndq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrintz\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrteQf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrsqrteQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrsqrteQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
--
-- out_float32x4_t = vrsqrteq_f32 (arg0_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrsqrte\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrteQu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrsqrteQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrsqrteQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vrsqrteq_u32 (arg0_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrsqrte\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrtef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrsqrtef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrsqrtef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
--
-- out_float32x2_t = vrsqrte_f32 (arg0_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrsqrte\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrteu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vrsqrteu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrsqrteu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vrsqrte_u32 (arg0_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrsqrte\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrtsQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vrsqrtsQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrsqrtsQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vrsqrtsq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vrsqrts\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vrsqrtsf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vrsqrtsf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vrsqrtsf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vrsqrts_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vrsqrts\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanef32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32_t arg0_float32_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vsetq_lane_f32 (arg0_float32_t, arg1_float32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanep16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanep16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16_t arg0_poly16_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8_t = vsetq_lane_p16 (arg0_poly16_t, arg1_poly16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanep8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanep8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8_t arg0_poly8_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16_t = vsetq_lane_p8 (arg0_poly8_t, arg1_poly8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanes16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16_t arg0_int16_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vsetq_lane_s16 (arg0_int16_t, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanes32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32_t arg0_int32_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vsetq_lane_s32 (arg0_int32_t, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanes64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64_t arg0_int64_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vsetq_lane_s64 (arg0_int64_t, arg1_int64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_lanes8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_lanes8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8_t arg0_int8_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vsetq_lane_s8 (arg0_int8_t, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_laneu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16_t arg0_uint16_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vsetq_lane_u16 (arg0_uint16_t, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_laneu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32_t arg0_uint32_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vsetq_lane_u32 (arg0_uint32_t, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_laneu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64_t arg0_uint64_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vsetq_lane_u64 (arg0_uint64_t, arg1_uint64x2_t, 0);
--}
--
--/* { dg-final { scan-assembler "vmov\[ \]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsetQ_laneu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsetQ_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsetQ_laneu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8_t arg0_uint8_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vsetq_lane_u8 (arg0_uint8_t, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanef32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanef32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32_t arg0_float32_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vset_lane_f32 (arg0_float32_t, arg1_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanep16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanep16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16_t arg0_poly16_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4_t = vset_lane_p16 (arg0_poly16_t, arg1_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanep8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanep8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8_t arg0_poly8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8_t = vset_lane_p8 (arg0_poly8_t, arg1_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanes16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16_t arg0_int16_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vset_lane_s16 (arg0_int16_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanes32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32_t arg0_int32_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vset_lane_s32 (arg0_int32_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vset_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanes64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64_t arg0_int64_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vset_lane_s64 (arg0_int64_t, arg1_int64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_lanes8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_lanes8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8_t arg0_int8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vset_lane_s8 (arg0_int8_t, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_laneu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16_t arg0_uint16_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vset_lane_u16 (arg0_uint16_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.16\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_laneu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32_t arg0_uint32_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vset_lane_u32 (arg0_uint32_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.32\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vset_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_laneu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64_t arg0_uint64_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vset_lane_u64 (arg0_uint64_t, arg1_uint64x1_t, 0);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vset_laneu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vset_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vset_laneu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8_t arg0_uint8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vset_lane_u8 (arg0_uint8_t, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vmov\.8\[ \]+\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[rR\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vshlq_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vshlq_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int64x2_t = vshlq_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vshlq_n_s8 (arg0_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vshlq_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vshlq_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64x2_t = vshlq_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQ_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshlQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vshlq_n_u8 (arg0_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vshlq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vshlq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vshlq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vshlq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vshlq_u16 (arg0_uint16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vshlq_u32 (arg0_uint32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_uint64x2_t = vshlq_u64 (arg0_uint64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vshlq_u8 (arg0_uint8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vshl_n_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vshl_n_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64x1_t = vshl_n_s64 (arg0_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vshl_n_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vshl_n_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vshl_n_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64x1_t = vshl_n_u64 (arg0_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshl_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshl_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshl_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vshl_n_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshl\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshll_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshll_ns16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int32x4_t = vshll_n_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshll\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshll_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshll_ns32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int64x2_t = vshll_n_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshll\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshll_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshll_ns8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int16x8_t = vshll_n_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshll\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshll_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshll_nu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint32x4_t = vshll_n_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshll\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshll_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshll_nu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint64x2_t = vshll_n_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshll\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshll_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshll_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshll_nu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint16x8_t = vshll_n_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshll\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshls16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vshl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshls32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vshl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshls64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshls64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshls64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vshl_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshls8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vshl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vshl_u16 (arg0_uint16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vshl_u32 (arg0_uint32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_uint64x1_t = vshl_u64 (arg0_uint64x1_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshlu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vshlu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshlu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vshl_u8 (arg0_uint8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vshl\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int16x8_t = vshrq_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int32x4_t = vshrq_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int64x2_t = vshrq_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
--
-- out_int8x16_t = vshrq_n_s8 (arg0_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint16x8_t = vshrq_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint32x4_t = vshrq_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint64x2_t = vshrq_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrQ_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
--
-- out_uint8x16_t = vshrq_n_u8 (arg0_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
--
-- out_int16x4_t = vshr_n_s16 (arg0_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
--
-- out_int32x2_t = vshr_n_s32 (arg0_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
--
-- out_int64x1_t = vshr_n_s64 (arg0_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_ns8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
--
-- out_int8x8_t = vshr_n_s8 (arg0_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
--
-- out_uint16x4_t = vshr_n_u16 (arg0_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
--
-- out_uint32x2_t = vshr_n_u32 (arg0_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
--
-- out_uint64x1_t = vshr_n_u64 (arg0_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshr_nu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshr_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshr_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
--
-- out_uint8x8_t = vshr_n_u8 (arg0_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshr\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_ns16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrn_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrn_ns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
--
-- out_int8x8_t = vshrn_n_s16 (arg0_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_ns32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrn_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrn_ns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
--
-- out_int16x4_t = vshrn_n_s32 (arg0_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_ns64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrn_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrn_ns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
--
-- out_int32x2_t = vshrn_n_s64 (arg0_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_nu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrn_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrn_nu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
--
-- out_uint8x8_t = vshrn_n_u16 (arg0_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshrn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_nu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrn_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrn_nu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
--
-- out_uint16x4_t = vshrn_n_u32 (arg0_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshrn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vshrn_nu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vshrn_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vshrn_nu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
--
-- out_uint32x2_t = vshrn_n_u64 (arg0_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vshrn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_np16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_np16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x8_t arg0_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8_t = vsliq_n_p16 (arg0_poly16x8_t, arg1_poly16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_np64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_np64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_np64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64x2_t arg0_poly64x2_t;
-- poly64x2_t arg1_poly64x2_t;
--
-- out_poly64x2_t = vsliq_n_p64 (arg0_poly64x2_t, arg1_poly64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_np8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_np8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16_t = vsliq_n_p8 (arg0_poly8x16_t, arg1_poly8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vsliq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vsliq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vsliq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vsliq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vsliq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vsliq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vsliq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsliQ_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsliQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsliQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vsliq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_np16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_np16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4_t = vsli_n_p16 (arg0_poly16x4_t, arg1_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_np64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_np64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vsli_np64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x1_t arg0_poly64x1_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- out_poly64x1_t = vsli_n_p64 (arg0_poly64x1_t, arg1_poly64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_np8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_np8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8_t = vsli_n_p8 (arg0_poly8x8_t, arg1_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vsli_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vsli_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vsli_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vsli_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vsli_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vsli_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vsli_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsli_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsli_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsli_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vsli_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsli\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vsraq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vsraq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vsraq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vsraq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vsraq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vsraq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vsraq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsraQ_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsraQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsraQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vsraq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vsra_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vsra_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vsra_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vsra_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vsra_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vsra_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vsra_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsra_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsra_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsra_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vsra_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsra\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_np16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_np16 (void)
--{
-- poly16x8_t out_poly16x8_t;
-- poly16x8_t arg0_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8_t = vsriq_n_p16 (arg0_poly16x8_t, arg1_poly16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_np64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_np64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_np64 (void)
--{
-- poly64x2_t out_poly64x2_t;
-- poly64x2_t arg0_poly64x2_t;
-- poly64x2_t arg1_poly64x2_t;
--
-- out_poly64x2_t = vsriq_n_p64 (arg0_poly64x2_t, arg1_poly64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_np8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_np8 (void)
--{
-- poly8x16_t out_poly8x16_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16_t = vsriq_n_p8 (arg0_poly8x16_t, arg1_poly8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_ns16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vsriq_n_s16 (arg0_int16x8_t, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_ns32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vsriq_n_s32 (arg0_int32x4_t, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_ns64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vsriq_n_s64 (arg0_int64x2_t, arg1_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_ns8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vsriq_n_s8 (arg0_int8x16_t, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_nu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vsriq_n_u16 (arg0_uint16x8_t, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_nu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vsriq_n_u32 (arg0_uint32x4_t, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_nu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vsriq_n_u64 (arg0_uint64x2_t, arg1_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsriQ_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsriQ_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsriQ_nu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vsriq_n_u8 (arg0_uint8x16_t, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_np16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_np16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_np16 (void)
--{
-- poly16x4_t out_poly16x4_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4_t = vsri_n_p16 (arg0_poly16x4_t, arg1_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_np64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_np64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vsri_np64 (void)
--{
-- poly64x1_t out_poly64x1_t;
-- poly64x1_t arg0_poly64x1_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- out_poly64x1_t = vsri_n_p64 (arg0_poly64x1_t, arg1_poly64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_np8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_np8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_np8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8_t = vsri_n_p8 (arg0_poly8x8_t, arg1_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_ns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_ns16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vsri_n_s16 (arg0_int16x4_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_ns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_ns32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vsri_n_s32 (arg0_int32x2_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_ns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_ns64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vsri_n_s64 (arg0_int64x1_t, arg1_int64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_ns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_ns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_ns8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vsri_n_s8 (arg0_int8x8_t, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_nu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_nu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vsri_n_u16 (arg0_uint16x4_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_nu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_nu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vsri_n_u32 (arg0_uint32x2_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_nu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_nu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vsri_n_u64 (arg0_uint64x1_t, arg1_uint64x1_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.64\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsri_nu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsri_nu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsri_nu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vsri_n_u8 (arg0_uint8x8_t, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vsri\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4_t arg1_float32x4_t;
--
-- vst1q_lane_f32 (arg0_float32_t, arg1_float32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- vst1q_lane_p16 (arg0_poly16_t, arg1_poly16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanep64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanep64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x2_t arg1_poly64x2_t;
--
-- vst1q_lane_p64 (arg0_poly64_t, arg1_poly64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanep8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- vst1q_lane_p8 (arg0_poly8_t, arg1_poly8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8_t arg1_int16x8_t;
--
-- vst1q_lane_s16 (arg0_int16_t, arg1_int16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4_t arg1_int32x4_t;
--
-- vst1q_lane_s32 (arg0_int32_t, arg1_int32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanes64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x2_t arg1_int64x2_t;
--
-- vst1q_lane_s64 (arg0_int64_t, arg1_int64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_lanes8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x16_t arg1_int8x16_t;
--
-- vst1q_lane_s8 (arg0_int8_t, arg1_int8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- vst1q_lane_u16 (arg0_uint16_t, arg1_uint16x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- vst1q_lane_u32 (arg0_uint32_t, arg1_uint32x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu64-1.c
-+++ b/src//dev/null
-@@ -1,25 +0,0 @@
--/* Test the `vst1Q_laneu64' ARM Neon intrinsic. */
--
--/* Detect ICE in the case of unaligned memory address. */
--
--/* { dg-do compile } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--unsigned char dummy_store[1000];
--
--void
--foo (char* addr)
--{
-- uint8x16_t vdata = vld1q_u8 (addr);
-- vst1q_lane_u64 ((uint64_t*) &dummy_store, vreinterpretq_u64_u8 (vdata), 0);
--}
--
--uint64_t
--bar (uint64x2_t vdata)
--{
-- vdata = vld1q_lane_u64 ((uint64_t*) &dummy_store, vdata, 0);
-- return vgetq_lane_u64 (vdata, 0);
--}
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_laneu64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- vst1q_lane_u64 (arg0_uint64_t, arg1_uint64x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Q_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Q_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Q_laneu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- vst1q_lane_u8 (arg0_uint8_t, arg1_uint8x16_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qf32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qf32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4_t arg1_float32x4_t;
--
-- vst1q_f32 (arg0_float32_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qp16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qp16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- vst1q_p16 (arg0_poly16_t, arg1_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qp64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qp64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst1Qp64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x2_t arg1_poly64x2_t;
--
-- vst1q_p64 (arg0_poly64_t, arg1_poly64x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qp8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qp8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- vst1q_p8 (arg0_poly8_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qs16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8_t arg1_int16x8_t;
--
-- vst1q_s16 (arg0_int16_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qs32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4_t arg1_int32x4_t;
--
-- vst1q_s32 (arg0_int32_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qs64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x2_t arg1_int64x2_t;
--
-- vst1q_s64 (arg0_int64_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qs8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qs8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x16_t arg1_int8x16_t;
--
-- vst1q_s8 (arg0_int8_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- vst1q_u16 (arg0_uint16_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- vst1q_u32 (arg0_uint32_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qu64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- vst1q_u64 (arg0_uint64_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1Qu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1Qu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- vst1q_u8 (arg0_uint8_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2_t arg1_float32x2_t;
--
-- vst1_lane_f32 (arg0_float32_t, arg1_float32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- vst1_lane_p16 (arg0_poly16_t, arg1_poly16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanep64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanep64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanep64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- vst1_lane_p64 (arg0_poly64_t, arg1_poly64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanep8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- vst1_lane_p8 (arg0_poly8_t, arg1_poly8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4_t arg1_int16x4_t;
--
-- vst1_lane_s16 (arg0_int16_t, arg1_int16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2_t arg1_int32x2_t;
--
-- vst1_lane_s32 (arg0_int32_t, arg1_int32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanes64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanes64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x1_t arg1_int64x1_t;
--
-- vst1_lane_s64 (arg0_int64_t, arg1_int64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_lanes8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8_t arg1_int8x8_t;
--
-- vst1_lane_s8 (arg0_int8_t, arg1_int8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- vst1_lane_u16 (arg0_uint16_t, arg1_uint16x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- vst1_lane_u32 (arg0_uint32_t, arg1_uint32x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_laneu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_laneu64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- vst1_lane_u64 (arg0_uint64_t, arg1_uint64x1_t, 0);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1_laneu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- vst1_lane_u8 (arg0_uint8_t, arg1_uint8x8_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]\\\})|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1f32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2_t arg1_float32x2_t;
--
-- vst1_f32 (arg0_float32_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1p16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1p16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- vst1_p16 (arg0_poly16_t, arg1_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1p64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst1p64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x1_t arg1_poly64x1_t;
--
-- vst1_p64 (arg0_poly64_t, arg1_poly64x1_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1p8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- vst1_p8 (arg0_poly8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1s16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4_t arg1_int16x4_t;
--
-- vst1_s16 (arg0_int16_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1s32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2_t arg1_int32x2_t;
--
-- vst1_s32 (arg0_int32_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1s64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x1_t arg1_int64x1_t;
--
-- vst1_s64 (arg0_int64_t, arg1_int64x1_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1s8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8_t arg1_int8x8_t;
--
-- vst1_s8 (arg0_int8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1u16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- vst1_u16 (arg0_uint16_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.16\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1u32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- vst1_u32 (arg0_uint32_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.32\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1u64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- vst1_u64 (arg0_uint64_t, arg1_uint64x1_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst1u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst1u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst1u8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- vst1_u8 (arg0_uint8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.8\[ \]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Q_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4x2_t arg1_float32x4x2_t;
--
-- vst2q_lane_f32 (arg0_float32_t, arg1_float32x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Q_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8x2_t arg1_poly16x8x2_t;
--
-- vst2q_lane_p16 (arg0_poly16_t, arg1_poly16x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Q_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8x2_t arg1_int16x8x2_t;
--
-- vst2q_lane_s16 (arg0_int16_t, arg1_int16x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Q_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4x2_t arg1_int32x4x2_t;
--
-- vst2q_lane_s32 (arg0_int32_t, arg1_int32x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Q_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8x2_t arg1_uint16x8x2_t;
--
-- vst2q_lane_u16 (arg0_uint16_t, arg1_uint16x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Q_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4x2_t arg1_uint32x4x2_t;
--
-- vst2q_lane_u32 (arg0_uint32_t, arg1_uint32x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qf32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4x2_t arg1_float32x4x2_t;
--
-- vst2q_f32 (arg0_float32_t, arg1_float32x4x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qp16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8x2_t arg1_poly16x8x2_t;
--
-- vst2q_p16 (arg0_poly16_t, arg1_poly16x8x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qp8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x16x2_t arg1_poly8x16x2_t;
--
-- vst2q_p8 (arg0_poly8_t, arg1_poly8x16x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qs16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8x2_t arg1_int16x8x2_t;
--
-- vst2q_s16 (arg0_int16_t, arg1_int16x8x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qs32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4x2_t arg1_int32x4x2_t;
--
-- vst2q_s32 (arg0_int32_t, arg1_int32x4x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qs8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x16x2_t arg1_int8x16x2_t;
--
-- vst2q_s8 (arg0_int8_t, arg1_int8x16x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8x2_t arg1_uint16x8x2_t;
--
-- vst2q_u16 (arg0_uint16_t, arg1_uint16x8x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4x2_t arg1_uint32x4x2_t;
--
-- vst2q_u32 (arg0_uint32_t, arg1_uint32x4x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2Qu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst2Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2Qu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x16x2_t arg1_uint8x16x2_t;
--
-- vst2q_u8 (arg0_uint8_t, arg1_uint8x16x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2x2_t arg1_float32x2x2_t;
--
-- vst2_lane_f32 (arg0_float32_t, arg1_float32x2x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4x2_t arg1_poly16x4x2_t;
--
-- vst2_lane_p16 (arg0_poly16_t, arg1_poly16x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_lanep8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8x2_t arg1_poly8x8x2_t;
--
-- vst2_lane_p8 (arg0_poly8_t, arg1_poly8x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4x2_t arg1_int16x4x2_t;
--
-- vst2_lane_s16 (arg0_int16_t, arg1_int16x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2x2_t arg1_int32x2x2_t;
--
-- vst2_lane_s32 (arg0_int32_t, arg1_int32x2x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_lanes8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8x2_t arg1_int8x8x2_t;
--
-- vst2_lane_s8 (arg0_int8_t, arg1_int8x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4x2_t arg1_uint16x4x2_t;
--
-- vst2_lane_u16 (arg0_uint16_t, arg1_uint16x4x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2x2_t arg1_uint32x2x2_t;
--
-- vst2_lane_u32 (arg0_uint32_t, arg1_uint32x2x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2_laneu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8x2_t arg1_uint8x8x2_t;
--
-- vst2_lane_u8 (arg0_uint8_t, arg1_uint8x8x2_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2f32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2x2_t arg1_float32x2x2_t;
--
-- vst2_f32 (arg0_float32_t, arg1_float32x2x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2p16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2p16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4x2_t arg1_poly16x4x2_t;
--
-- vst2_p16 (arg0_poly16_t, arg1_poly16x4x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2p64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst2p64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x1x2_t arg1_poly64x1x2_t;
--
-- vst2_p64 (arg0_poly64_t, arg1_poly64x1x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2p8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8x2_t arg1_poly8x8x2_t;
--
-- vst2_p8 (arg0_poly8_t, arg1_poly8x8x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2s16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4x2_t arg1_int16x4x2_t;
--
-- vst2_s16 (arg0_int16_t, arg1_int16x4x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2s32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2x2_t arg1_int32x2x2_t;
--
-- vst2_s32 (arg0_int32_t, arg1_int32x2x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2s64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x1x2_t arg1_int64x1x2_t;
--
-- vst2_s64 (arg0_int64_t, arg1_int64x1x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2s8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8x2_t arg1_int8x8x2_t;
--
-- vst2_s8 (arg0_int8_t, arg1_int8x8x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2u16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4x2_t arg1_uint16x4x2_t;
--
-- vst2_u16 (arg0_uint16_t, arg1_uint16x4x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2u32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2x2_t arg1_uint32x2x2_t;
--
-- vst2_u32 (arg0_uint32_t, arg1_uint32x2x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2u64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x1x2_t arg1_uint64x1x2_t;
--
-- vst2_u64 (arg0_uint64_t, arg1_uint64x1x2_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst2u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst2u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst2u8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8x2_t arg1_uint8x8x2_t;
--
-- vst2_u8 (arg0_uint8_t, arg1_uint8x8x2_t);
--}
--
--/* { dg-final { scan-assembler "vst2\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Q_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4x3_t arg1_float32x4x3_t;
--
-- vst3q_lane_f32 (arg0_float32_t, arg1_float32x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Q_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8x3_t arg1_poly16x8x3_t;
--
-- vst3q_lane_p16 (arg0_poly16_t, arg1_poly16x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Q_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8x3_t arg1_int16x8x3_t;
--
-- vst3q_lane_s16 (arg0_int16_t, arg1_int16x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Q_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4x3_t arg1_int32x4x3_t;
--
-- vst3q_lane_s32 (arg0_int32_t, arg1_int32x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Q_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8x3_t arg1_uint16x8x3_t;
--
-- vst3q_lane_u16 (arg0_uint16_t, arg1_uint16x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Q_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4x3_t arg1_uint32x4x3_t;
--
-- vst3q_lane_u32 (arg0_uint32_t, arg1_uint32x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qf32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4x3_t arg1_float32x4x3_t;
--
-- vst3q_f32 (arg0_float32_t, arg1_float32x4x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qp16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8x3_t arg1_poly16x8x3_t;
--
-- vst3q_p16 (arg0_poly16_t, arg1_poly16x8x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qp8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x16x3_t arg1_poly8x16x3_t;
--
-- vst3q_p8 (arg0_poly8_t, arg1_poly8x16x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qs16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8x3_t arg1_int16x8x3_t;
--
-- vst3q_s16 (arg0_int16_t, arg1_int16x8x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qs32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4x3_t arg1_int32x4x3_t;
--
-- vst3q_s32 (arg0_int32_t, arg1_int32x4x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qs8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x16x3_t arg1_int8x16x3_t;
--
-- vst3q_s8 (arg0_int8_t, arg1_int8x16x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8x3_t arg1_uint16x8x3_t;
--
-- vst3q_u16 (arg0_uint16_t, arg1_uint16x8x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4x3_t arg1_uint32x4x3_t;
--
-- vst3q_u32 (arg0_uint32_t, arg1_uint32x4x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3Qu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst3Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3Qu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x16x3_t arg1_uint8x16x3_t;
--
-- vst3q_u8 (arg0_uint8_t, arg1_uint8x16x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2x3_t arg1_float32x2x3_t;
--
-- vst3_lane_f32 (arg0_float32_t, arg1_float32x2x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4x3_t arg1_poly16x4x3_t;
--
-- vst3_lane_p16 (arg0_poly16_t, arg1_poly16x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_lanep8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8x3_t arg1_poly8x8x3_t;
--
-- vst3_lane_p8 (arg0_poly8_t, arg1_poly8x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4x3_t arg1_int16x4x3_t;
--
-- vst3_lane_s16 (arg0_int16_t, arg1_int16x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2x3_t arg1_int32x2x3_t;
--
-- vst3_lane_s32 (arg0_int32_t, arg1_int32x2x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_lanes8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8x3_t arg1_int8x8x3_t;
--
-- vst3_lane_s8 (arg0_int8_t, arg1_int8x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4x3_t arg1_uint16x4x3_t;
--
-- vst3_lane_u16 (arg0_uint16_t, arg1_uint16x4x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2x3_t arg1_uint32x2x3_t;
--
-- vst3_lane_u32 (arg0_uint32_t, arg1_uint32x2x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3_laneu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8x3_t arg1_uint8x8x3_t;
--
-- vst3_lane_u8 (arg0_uint8_t, arg1_uint8x8x3_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3f32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2x3_t arg1_float32x2x3_t;
--
-- vst3_f32 (arg0_float32_t, arg1_float32x2x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3p16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3p16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4x3_t arg1_poly16x4x3_t;
--
-- vst3_p16 (arg0_poly16_t, arg1_poly16x4x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3p64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst3p64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x1x3_t arg1_poly64x1x3_t;
--
-- vst3_p64 (arg0_poly64_t, arg1_poly64x1x3_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3p8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8x3_t arg1_poly8x8x3_t;
--
-- vst3_p8 (arg0_poly8_t, arg1_poly8x8x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3s16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4x3_t arg1_int16x4x3_t;
--
-- vst3_s16 (arg0_int16_t, arg1_int16x4x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3s32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2x3_t arg1_int32x2x3_t;
--
-- vst3_s32 (arg0_int32_t, arg1_int32x2x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3s64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x1x3_t arg1_int64x1x3_t;
--
-- vst3_s64 (arg0_int64_t, arg1_int64x1x3_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3s8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8x3_t arg1_int8x8x3_t;
--
-- vst3_s8 (arg0_int8_t, arg1_int8x8x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3u16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4x3_t arg1_uint16x4x3_t;
--
-- vst3_u16 (arg0_uint16_t, arg1_uint16x4x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3u32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2x3_t arg1_uint32x2x3_t;
--
-- vst3_u32 (arg0_uint32_t, arg1_uint32x2x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3u64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x1x3_t arg1_uint64x1x3_t;
--
-- vst3_u64 (arg0_uint64_t, arg1_uint64x1x3_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst3u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst3u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst3u8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8x3_t arg1_uint8x8x3_t;
--
-- vst3_u8 (arg0_uint8_t, arg1_uint8x8x3_t);
--}
--
--/* { dg-final { scan-assembler "vst3\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4Q_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Q_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4x4_t arg1_float32x4x4_t;
--
-- vst4q_lane_f32 (arg0_float32_t, arg1_float32x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4Q_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Q_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8x4_t arg1_poly16x8x4_t;
--
-- vst4q_lane_p16 (arg0_poly16_t, arg1_poly16x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4Q_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Q_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8x4_t arg1_int16x8x4_t;
--
-- vst4q_lane_s16 (arg0_int16_t, arg1_int16x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4Q_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Q_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4x4_t arg1_int32x4x4_t;
--
-- vst4q_lane_s32 (arg0_int32_t, arg1_int32x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4Q_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Q_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8x4_t arg1_uint16x8x4_t;
--
-- vst4q_lane_u16 (arg0_uint16_t, arg1_uint16x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Q_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4Q_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Q_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4x4_t arg1_uint32x4x4_t;
--
-- vst4q_lane_u32 (arg0_uint32_t, arg1_uint32x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qf32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x4x4_t arg1_float32x4x4_t;
--
-- vst4q_f32 (arg0_float32_t, arg1_float32x4x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qp16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x8x4_t arg1_poly16x8x4_t;
--
-- vst4q_p16 (arg0_poly16_t, arg1_poly16x8x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qp8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x16x4_t arg1_poly8x16x4_t;
--
-- vst4q_p8 (arg0_poly8_t, arg1_poly8x16x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qs16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x8x4_t arg1_int16x8x4_t;
--
-- vst4q_s16 (arg0_int16_t, arg1_int16x8x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qs32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x4x4_t arg1_int32x4x4_t;
--
-- vst4q_s32 (arg0_int32_t, arg1_int32x4x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qs8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x16x4_t arg1_int8x16x4_t;
--
-- vst4q_s8 (arg0_int8_t, arg1_int8x16x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x8x4_t arg1_uint16x8x4_t;
--
-- vst4q_u16 (arg0_uint16_t, arg1_uint16x8x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x4x4_t arg1_uint32x4x4_t;
--
-- vst4q_u32 (arg0_uint32_t, arg1_uint32x4x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4Qu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vst4Qu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4Qu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x16x4_t arg1_uint8x16x4_t;
--
-- vst4q_u8 (arg0_uint8_t, arg1_uint8x16x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanef32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_lanef32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_lanef32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2x4_t arg1_float32x2x4_t;
--
-- vst4_lane_f32 (arg0_float32_t, arg1_float32x2x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanep16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_lanep16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_lanep16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4x4_t arg1_poly16x4x4_t;
--
-- vst4_lane_p16 (arg0_poly16_t, arg1_poly16x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanep8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_lanep8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_lanep8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8x4_t arg1_poly8x8x4_t;
--
-- vst4_lane_p8 (arg0_poly8_t, arg1_poly8x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanes16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_lanes16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_lanes16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4x4_t arg1_int16x4x4_t;
--
-- vst4_lane_s16 (arg0_int16_t, arg1_int16x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanes32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_lanes32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_lanes32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2x4_t arg1_int32x2x4_t;
--
-- vst4_lane_s32 (arg0_int32_t, arg1_int32x2x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_lanes8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_lanes8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_lanes8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8x4_t arg1_int8x8x4_t;
--
-- vst4_lane_s8 (arg0_int8_t, arg1_int8x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_laneu16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_laneu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_laneu16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4x4_t arg1_uint16x4x4_t;
--
-- vst4_lane_u16 (arg0_uint16_t, arg1_uint16x4x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_laneu32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_laneu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_laneu32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2x4_t arg1_uint32x2x4_t;
--
-- vst4_lane_u32 (arg0_uint32_t, arg1_uint32x2x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4_laneu8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4_laneu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4_laneu8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8x4_t arg1_uint8x8x4_t;
--
-- vst4_lane_u8 (arg0_uint8_t, arg1_uint8x8x4_t, 1);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+\\\[\[0-9\]+\\\]-\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])|(\[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\], \[dD\]\[0-9\]+\\\[\[0-9\]+\\\]))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4f32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4f32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4f32 (void)
--{
-- float32_t *arg0_float32_t;
-- float32x2x4_t arg1_float32x2x4_t;
--
-- vst4_f32 (arg0_float32_t, arg1_float32x2x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4p16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4p16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4p16 (void)
--{
-- poly16_t *arg0_poly16_t;
-- poly16x4x4_t arg1_poly16x4x4_t;
--
-- vst4_p16 (arg0_poly16_t, arg1_poly16x4x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4p64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4p64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_crypto_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_crypto } */
--
--#include "arm_neon.h"
--
--void test_vst4p64 (void)
--{
-- poly64_t *arg0_poly64_t;
-- poly64x1x4_t arg1_poly64x1x4_t;
--
-- vst4_p64 (arg0_poly64_t, arg1_poly64x1x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4p8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4p8 (void)
--{
-- poly8_t *arg0_poly8_t;
-- poly8x8x4_t arg1_poly8x8x4_t;
--
-- vst4_p8 (arg0_poly8_t, arg1_poly8x8x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4s16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4s16 (void)
--{
-- int16_t *arg0_int16_t;
-- int16x4x4_t arg1_int16x4x4_t;
--
-- vst4_s16 (arg0_int16_t, arg1_int16x4x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4s32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4s32 (void)
--{
-- int32_t *arg0_int32_t;
-- int32x2x4_t arg1_int32x2x4_t;
--
-- vst4_s32 (arg0_int32_t, arg1_int32x2x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4s64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4s64 (void)
--{
-- int64_t *arg0_int64_t;
-- int64x1x4_t arg1_int64x1x4_t;
--
-- vst4_s64 (arg0_int64_t, arg1_int64x1x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4s8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4s8 (void)
--{
-- int8_t *arg0_int8_t;
-- int8x8x4_t arg1_int8x8x4_t;
--
-- vst4_s8 (arg0_int8_t, arg1_int8x8x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u16.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4u16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4u16 (void)
--{
-- uint16_t *arg0_uint16_t;
-- uint16x4x4_t arg1_uint16x4x4_t;
--
-- vst4_u16 (arg0_uint16_t, arg1_uint16x4x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.16\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u32.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4u32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4u32 (void)
--{
-- uint32_t *arg0_uint32_t;
-- uint32x2x4_t arg1_uint32x2x4_t;
--
-- vst4_u32 (arg0_uint32_t, arg1_uint32x2x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.32\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4u64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4u64 (void)
--{
-- uint64_t *arg0_uint64_t;
-- uint64x1x4_t arg1_uint64x1x4_t;
--
-- vst4_u64 (arg0_uint64_t, arg1_uint64x1x4_t);
--}
--
--/* { dg-final { scan-assembler "vst1\.64\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vst4u8.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vst4u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vst4u8 (void)
--{
-- uint8_t *arg0_uint8_t;
-- uint8x8x4_t arg1_uint8x8x4_t;
--
-- vst4_u8 (arg0_uint8_t, arg1_uint8x8x4_t);
--}
--
--/* { dg-final { scan-assembler "vst4\.8\[ \]+\\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQf32 (void)
--{
-- float32x4_t out_float32x4_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4_t = vsubq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQs16 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8_t = vsubq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQs32 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4_t = vsubq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQs64 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int64x2_t = vsubq_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQs8 (void)
--{
-- int8x16_t out_int8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16_t = vsubq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vsubq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQu64 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint64x2_t = vsubq_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i64\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubf32 (void)
--{
-- float32x2_t out_float32x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2_t = vsub_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubhns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubhns16 (void)
--{
-- int8x8_t out_int8x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int8x8_t = vsubhn_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubhns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubhns32 (void)
--{
-- int16x4_t out_int16x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int16x4_t = vsubhn_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhns64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubhns64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubhns64 (void)
--{
-- int32x2_t out_int32x2_t;
-- int64x2_t arg0_int64x2_t;
-- int64x2_t arg1_int64x2_t;
--
-- out_int32x2_t = vsubhn_s64 (arg0_int64x2_t, arg1_int64x2_t);
--}
--
--/* { dg-final { scan-assembler "vsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhnu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubhnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubhnu16 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint8x8_t = vsubhn_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vsubhn\.i16\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhnu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubhnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubhnu32 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint16x4_t = vsubhn_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vsubhn\.i32\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubhnu64.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubhnu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubhnu64 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint64x2_t arg1_uint64x2_t;
--
-- out_uint32x2_t = vsubhn_u64 (arg0_uint64x2_t, arg1_uint64x2_t);
--}
--
--/* { dg-final { scan-assembler "vsubhn\.i64\[ \]+\[dD\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubls16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubls16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubls16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vsubl_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vsubl\.s16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubls32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubls32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubls32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vsubl_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsubl\.s32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubls8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubls8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubls8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x8_t = vsubl_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vsubl\.s8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsublu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsublu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsublu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vsubl_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vsubl\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsublu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsublu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsublu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vsubl_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsubl\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsublu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsublu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsublu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x8_t = vsubl_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vsubl\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubs16 (void)
--{
-- int16x4_t out_int16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4_t = vsub_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubs32 (void)
--{
-- int32x2_t out_int32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2_t = vsub_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vsubs64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubs64 (void)
--{
-- int64x1_t out_int64x1_t;
-- int64x1_t arg0_int64x1_t;
-- int64x1_t arg1_int64x1_t;
--
-- out_int64x1_t = vsub_s64 (arg0_int64x1_t, arg1_int64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubs8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vsub_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vsub_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vsub_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu64.c
-+++ b/src//dev/null
-@@ -1,19 +0,0 @@
--/* Test the `vsubu64' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubu64 (void)
--{
-- uint64x1_t out_uint64x1_t;
-- uint64x1_t arg0_uint64x1_t;
-- uint64x1_t arg1_uint64x1_t;
--
-- out_uint64x1_t = vsub_u64 (arg0_uint64x1_t, arg1_uint64x1_t);
--}
--
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vsub_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vsub\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubws16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubws16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubws16 (void)
--{
-- int32x4_t out_int32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int32x4_t = vsubw_s16 (arg0_int32x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vsubw\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubws32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubws32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubws32 (void)
--{
-- int64x2_t out_int64x2_t;
-- int64x2_t arg0_int64x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int64x2_t = vsubw_s32 (arg0_int64x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsubw\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubws8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubws8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubws8 (void)
--{
-- int16x8_t out_int16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int16x8_t = vsubw_s8 (arg0_int16x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vsubw\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubwu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubwu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubwu16 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint32x4_t = vsubw_u16 (arg0_uint32x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vsubw\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubwu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubwu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubwu32 (void)
--{
-- uint64x2_t out_uint64x2_t;
-- uint64x2_t arg0_uint64x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint64x2_t = vsubw_u32 (arg0_uint64x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vsubw\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vsubwu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vsubwu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vsubwu8 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint16x8_t = vsubw_u8 (arg0_uint16x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vsubw\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl1p8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl1p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl1p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_poly8x8_t = vtbl1_p8 (arg0_poly8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl1s8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl1s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl1s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vtbl1_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl1u8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl1u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl1u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vtbl1_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl2p8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl2p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl2p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8x2_t arg0_poly8x8x2_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_poly8x8_t = vtbl2_p8 (arg0_poly8x8x2_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl2s8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl2s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl2s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8x2_t arg0_int8x8x2_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vtbl2_s8 (arg0_int8x8x2_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl2u8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl2u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl2u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8x2_t arg0_uint8x8x2_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vtbl2_u8 (arg0_uint8x8x2_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl3p8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl3p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl3p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8x3_t arg0_poly8x8x3_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_poly8x8_t = vtbl3_p8 (arg0_poly8x8x3_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl3s8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl3s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl3s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8x3_t arg0_int8x8x3_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vtbl3_s8 (arg0_int8x8x3_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl3u8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl3u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl3u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8x3_t arg0_uint8x8x3_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vtbl3_u8 (arg0_uint8x8x3_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl4p8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl4p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl4p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8x4_t arg0_poly8x8x4_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_poly8x8_t = vtbl4_p8 (arg0_poly8x8x4_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl4s8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl4s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl4s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8x4_t arg0_int8x8x4_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8_t = vtbl4_s8 (arg0_int8x8x4_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbl4u8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtbl4u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbl4u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8x4_t arg0_uint8x8x4_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vtbl4_u8 (arg0_uint8x8x4_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbl\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx1p8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx1p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx1p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_poly8x8_t = vtbx1_p8 (arg0_poly8x8_t, arg1_poly8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx1s8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx1s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx1s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vtbx1_s8 (arg0_int8x8_t, arg1_int8x8_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx1u8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx1u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx1u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vtbx1_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, ((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx2p8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx2p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx2p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8x2_t arg1_poly8x8x2_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_poly8x8_t = vtbx2_p8 (arg0_poly8x8_t, arg1_poly8x8x2_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx2s8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx2s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx2s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8x2_t arg1_int8x8x2_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vtbx2_s8 (arg0_int8x8_t, arg1_int8x8x2_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx2u8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx2u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx2u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8x2_t arg1_uint8x8x2_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vtbx2_u8 (arg0_uint8x8_t, arg1_uint8x8x2_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx3p8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx3p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx3p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8x3_t arg1_poly8x8x3_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_poly8x8_t = vtbx3_p8 (arg0_poly8x8_t, arg1_poly8x8x3_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx3s8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx3s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx3s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8x3_t arg1_int8x8x3_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vtbx3_s8 (arg0_int8x8_t, arg1_int8x8x3_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx3u8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx3u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx3u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8x3_t arg1_uint8x8x3_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vtbx3_u8 (arg0_uint8x8_t, arg1_uint8x8x3_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx4p8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx4p8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx4p8 (void)
--{
-- poly8x8_t out_poly8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8x4_t arg1_poly8x8x4_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_poly8x8_t = vtbx4_p8 (arg0_poly8x8_t, arg1_poly8x8x4_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx4s8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx4s8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx4s8 (void)
--{
-- int8x8_t out_int8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8x4_t arg1_int8x8x4_t;
-- int8x8_t arg2_int8x8_t;
--
-- out_int8x8_t = vtbx4_s8 (arg0_int8x8_t, arg1_int8x8x4_t, arg2_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtbx4u8.c
-+++ b/src//dev/null
-@@ -1,21 +0,0 @@
--/* Test the `vtbx4u8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtbx4u8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8x4_t arg1_uint8x8x4_t;
-- uint8x8_t arg2_uint8x8_t;
--
-- out_uint8x8_t = vtbx4_u8 (arg0_uint8x8_t, arg1_uint8x8x4_t, arg2_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtbx\.8\[ \]+\[dD\]\[0-9\]+, \\\{((\[dD\]\[0-9\]+-\[dD\]\[0-9\]+)|(\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+))\\\}, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQf32 (void)
--{
-- float32x4x2_t out_float32x4x2_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4x2_t = vtrnq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQp16 (void)
--{
-- poly16x8x2_t out_poly16x8x2_t;
-- poly16x8_t arg0_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8x2_t = vtrnq_p16 (arg0_poly16x8_t, arg1_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQp8 (void)
--{
-- poly8x16x2_t out_poly8x16x2_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16x2_t = vtrnq_p8 (arg0_poly8x16_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQs16 (void)
--{
-- int16x8x2_t out_int16x8x2_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8x2_t = vtrnq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQs32 (void)
--{
-- int32x4x2_t out_int32x4x2_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4x2_t = vtrnq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQs8 (void)
--{
-- int8x16x2_t out_int8x16x2_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16x2_t = vtrnq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQu16 (void)
--{
-- uint16x8x2_t out_uint16x8x2_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8x2_t = vtrnq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQu32 (void)
--{
-- uint32x4x2_t out_uint32x4x2_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4x2_t = vtrnq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnQu8 (void)
--{
-- uint8x16x2_t out_uint8x16x2_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16x2_t = vtrnq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnf32 (void)
--{
-- float32x2x2_t out_float32x2x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2x2_t = vtrn_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnp16 (void)
--{
-- poly16x4x2_t out_poly16x4x2_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4x2_t = vtrn_p16 (arg0_poly16x4_t, arg1_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnp8 (void)
--{
-- poly8x8x2_t out_poly8x8x2_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8x2_t = vtrn_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrns16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrns16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrns16 (void)
--{
-- int16x4x2_t out_int16x4x2_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4x2_t = vtrn_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrns32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrns32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrns32 (void)
--{
-- int32x2x2_t out_int32x2x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2x2_t = vtrn_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrns8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrns8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrns8 (void)
--{
-- int8x8x2_t out_int8x8x2_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8x2_t = vtrn_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnu16 (void)
--{
-- uint16x4x2_t out_uint16x4x2_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4x2_t = vtrn_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnu32 (void)
--{
-- uint32x2x2_t out_uint32x2x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2x2_t = vtrn_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtrnu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtrnu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtrnu8 (void)
--{
-- uint8x8x2_t out_uint8x8x2_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8x2_t = vtrn_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtrn\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQp8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_uint8x16_t = vtstq_p8 (arg0_poly8x16_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQs16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_uint16x8_t = vtstq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQs32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_uint32x4_t = vtstq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQs8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_uint8x16_t = vtstq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQu16 (void)
--{
-- uint16x8_t out_uint16x8_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8_t = vtstq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQu32 (void)
--{
-- uint32x4_t out_uint32x4_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4_t = vtstq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstQu8 (void)
--{
-- uint8x16_t out_uint8x16_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16_t = vtstq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstp8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_uint8x8_t = vtst_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtsts16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtsts16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtsts16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_uint16x4_t = vtst_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtsts32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtsts32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtsts32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_uint32x2_t = vtst_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtsts8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtsts8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtsts8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_uint8x8_t = vtst_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstu16 (void)
--{
-- uint16x4_t out_uint16x4_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4_t = vtst_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstu32 (void)
--{
-- uint32x2_t out_uint32x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2_t = vtst_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vtstu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vtstu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vtstu8 (void)
--{
-- uint8x8_t out_uint8x8_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8_t = vtst_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vtst\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQf32 (void)
--{
-- float32x4x2_t out_float32x4x2_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4x2_t = vuzpq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQp16 (void)
--{
-- poly16x8x2_t out_poly16x8x2_t;
-- poly16x8_t arg0_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8x2_t = vuzpq_p16 (arg0_poly16x8_t, arg1_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQp8 (void)
--{
-- poly8x16x2_t out_poly8x16x2_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16x2_t = vuzpq_p8 (arg0_poly8x16_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQs16 (void)
--{
-- int16x8x2_t out_int16x8x2_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8x2_t = vuzpq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQs32 (void)
--{
-- int32x4x2_t out_int32x4x2_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4x2_t = vuzpq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQs8 (void)
--{
-- int8x16x2_t out_int8x16x2_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16x2_t = vuzpq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQu16 (void)
--{
-- uint16x8x2_t out_uint16x8x2_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8x2_t = vuzpq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQu32 (void)
--{
-- uint32x4x2_t out_uint32x4x2_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4x2_t = vuzpq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpQu8 (void)
--{
-- uint8x16x2_t out_uint8x16x2_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16x2_t = vuzpq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpf32 (void)
--{
-- float32x2x2_t out_float32x2x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2x2_t = vuzp_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpp16 (void)
--{
-- poly16x4x2_t out_poly16x4x2_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4x2_t = vuzp_p16 (arg0_poly16x4_t, arg1_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpp8 (void)
--{
-- poly8x8x2_t out_poly8x8x2_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8x2_t = vuzp_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzps16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzps16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzps16 (void)
--{
-- int16x4x2_t out_int16x4x2_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4x2_t = vuzp_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzps32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzps32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzps32 (void)
--{
-- int32x2x2_t out_int32x2x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2x2_t = vuzp_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzps8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzps8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzps8 (void)
--{
-- int8x8x2_t out_int8x8x2_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8x2_t = vuzp_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpu16 (void)
--{
-- uint16x4x2_t out_uint16x4x2_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4x2_t = vuzp_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpu32 (void)
--{
-- uint32x2x2_t out_uint32x2x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2x2_t = vuzp_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vuzpu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vuzpu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vuzpu8 (void)
--{
-- uint8x8x2_t out_uint8x8x2_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8x2_t = vuzp_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQf32 (void)
--{
-- float32x4x2_t out_float32x4x2_t;
-- float32x4_t arg0_float32x4_t;
-- float32x4_t arg1_float32x4_t;
--
-- out_float32x4x2_t = vzipq_f32 (arg0_float32x4_t, arg1_float32x4_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQp16 (void)
--{
-- poly16x8x2_t out_poly16x8x2_t;
-- poly16x8_t arg0_poly16x8_t;
-- poly16x8_t arg1_poly16x8_t;
--
-- out_poly16x8x2_t = vzipq_p16 (arg0_poly16x8_t, arg1_poly16x8_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQp8 (void)
--{
-- poly8x16x2_t out_poly8x16x2_t;
-- poly8x16_t arg0_poly8x16_t;
-- poly8x16_t arg1_poly8x16_t;
--
-- out_poly8x16x2_t = vzipq_p8 (arg0_poly8x16_t, arg1_poly8x16_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQs16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQs16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQs16 (void)
--{
-- int16x8x2_t out_int16x8x2_t;
-- int16x8_t arg0_int16x8_t;
-- int16x8_t arg1_int16x8_t;
--
-- out_int16x8x2_t = vzipq_s16 (arg0_int16x8_t, arg1_int16x8_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQs32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQs32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQs32 (void)
--{
-- int32x4x2_t out_int32x4x2_t;
-- int32x4_t arg0_int32x4_t;
-- int32x4_t arg1_int32x4_t;
--
-- out_int32x4x2_t = vzipq_s32 (arg0_int32x4_t, arg1_int32x4_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQs8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQs8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQs8 (void)
--{
-- int8x16x2_t out_int8x16x2_t;
-- int8x16_t arg0_int8x16_t;
-- int8x16_t arg1_int8x16_t;
--
-- out_int8x16x2_t = vzipq_s8 (arg0_int8x16_t, arg1_int8x16_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQu16 (void)
--{
-- uint16x8x2_t out_uint16x8x2_t;
-- uint16x8_t arg0_uint16x8_t;
-- uint16x8_t arg1_uint16x8_t;
--
-- out_uint16x8x2_t = vzipq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQu32 (void)
--{
-- uint32x4x2_t out_uint32x4x2_t;
-- uint32x4_t arg0_uint32x4_t;
-- uint32x4_t arg1_uint32x4_t;
--
-- out_uint32x4x2_t = vzipq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipQu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipQu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipQu8 (void)
--{
-- uint8x16x2_t out_uint8x16x2_t;
-- uint8x16_t arg0_uint8x16_t;
-- uint8x16_t arg1_uint8x16_t;
--
-- out_uint8x16x2_t = vzipq_u8 (arg0_uint8x16_t, arg1_uint8x16_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipf32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipf32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipf32 (void)
--{
-- float32x2x2_t out_float32x2x2_t;
-- float32x2_t arg0_float32x2_t;
-- float32x2_t arg1_float32x2_t;
--
-- out_float32x2x2_t = vzip_f32 (arg0_float32x2_t, arg1_float32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipp16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipp16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipp16 (void)
--{
-- poly16x4x2_t out_poly16x4x2_t;
-- poly16x4_t arg0_poly16x4_t;
-- poly16x4_t arg1_poly16x4_t;
--
-- out_poly16x4x2_t = vzip_p16 (arg0_poly16x4_t, arg1_poly16x4_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipp8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipp8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipp8 (void)
--{
-- poly8x8x2_t out_poly8x8x2_t;
-- poly8x8_t arg0_poly8x8_t;
-- poly8x8_t arg1_poly8x8_t;
--
-- out_poly8x8x2_t = vzip_p8 (arg0_poly8x8_t, arg1_poly8x8_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzips16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzips16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzips16 (void)
--{
-- int16x4x2_t out_int16x4x2_t;
-- int16x4_t arg0_int16x4_t;
-- int16x4_t arg1_int16x4_t;
--
-- out_int16x4x2_t = vzip_s16 (arg0_int16x4_t, arg1_int16x4_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzips32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzips32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzips32 (void)
--{
-- int32x2x2_t out_int32x2x2_t;
-- int32x2_t arg0_int32x2_t;
-- int32x2_t arg1_int32x2_t;
--
-- out_int32x2x2_t = vzip_s32 (arg0_int32x2_t, arg1_int32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzips8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzips8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzips8 (void)
--{
-- int8x8x2_t out_int8x8x2_t;
-- int8x8_t arg0_int8x8_t;
-- int8x8_t arg1_int8x8_t;
--
-- out_int8x8x2_t = vzip_s8 (arg0_int8x8_t, arg1_int8x8_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipu16.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipu16' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipu16 (void)
--{
-- uint16x4x2_t out_uint16x4x2_t;
-- uint16x4_t arg0_uint16x4_t;
-- uint16x4_t arg1_uint16x4_t;
--
-- out_uint16x4x2_t = vzip_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipu32.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipu32' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipu32 (void)
--{
-- uint32x2x2_t out_uint32x2x2_t;
-- uint32x2_t arg0_uint32x2_t;
-- uint32x2_t arg1_uint32x2_t;
--
-- out_uint32x2x2_t = vzip_u32 (arg0_uint32x2_t, arg1_uint32x2_t);
--}
--
--/* { dg-final { scan-assembler "vuzp\.32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- a/src/gcc/testsuite/gcc.target/arm/neon/vzipu8.c
-+++ b/src//dev/null
-@@ -1,20 +0,0 @@
--/* Test the `vzipu8' ARM Neon intrinsic. */
--/* This file was autogenerated by neon-testgen. */
--
--/* { dg-do assemble } */
--/* { dg-require-effective-target arm_neon_ok } */
--/* { dg-options "-save-temps -O0" } */
--/* { dg-add-options arm_neon } */
--
--#include "arm_neon.h"
--
--void test_vzipu8 (void)
--{
-- uint8x8x2_t out_uint8x8x2_t;
-- uint8x8_t arg0_uint8x8_t;
-- uint8x8_t arg1_uint8x8_t;
--
-- out_uint8x8x2_t = vzip_u8 (arg0_uint8x8_t, arg1_uint8x8_t);
--}
--
--/* { dg-final { scan-assembler "vzip\.8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-1.c
-@@ -0,0 +1,7 @@
-+/* { dg-do compile { target { ! default_mode } } } */
-+/* { dg-skip-if "-marm/-mthumb/-march/-mcpu given" { *-*-* } { "-marm" "-mthumb" "-march=*" "-mcpu=*" } } */
-+/* { dg-options "-march=armv6-m" } */
-+
-+/* Check that -mthumb is not needed when compiling for a Thumb-only target. */
-+
-+int foo;
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-2.c
-@@ -0,0 +1,7 @@
-+/* { dg-do compile { target { ! default_mode } } } */
-+/* { dg-skip-if "-marm/-mthumb/-march/-mcpu given" { *-*-* } { "-marm" "-mthumb" "-march=*" "-mcpu=*" } } */
-+/* { dg-options "-mcpu=cortex-m4" } */
-+
-+/* Check that -mthumb is not needed when compiling for a Thumb-only target. */
-+
-+int foo;
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/optional_thumb-3.c
-@@ -0,0 +1,9 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_cortex_m } */
-+/* { dg-skip-if "-mthumb given" { *-*-* } { "-mthumb" } } */
-+/* { dg-options "-marm" } */
-+/* { dg-error "target CPU does not support ARM mode" "missing error with -marm on Thumb-only targets" { target *-*-* } 0 } */
-+
-+/* Check that -marm gives an error when compiling for a Thumb-only target. */
-+
-+int foo;
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/polytypes.c
-@@ -0,0 +1,48 @@
-+/* Check that NEON polynomial vector types are suitably incompatible with
-+ integer vector types of the same layout. */
-+
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-add-options arm_neon } */
-+
-+#include <arm_neon.h>
-+
-+void s64_8 (int8x8_t a) {}
-+void u64_8 (uint8x8_t a) {}
-+void p64_8 (poly8x8_t a) {}
-+void s64_16 (int16x4_t a) {}
-+void u64_16 (uint16x4_t a) {}
-+void p64_16 (poly16x4_t a) {}
-+
-+void s128_8 (int8x16_t a) {}
-+void u128_8 (uint8x16_t a) {}
-+void p128_8 (poly8x16_t a) {}
-+void s128_16 (int16x8_t a) {}
-+void u128_16 (uint16x8_t a) {}
-+void p128_16 (poly16x8_t a) {}
-+
-+void foo ()
-+{
-+ poly8x8_t v64_8;
-+ poly16x4_t v64_16;
-+ poly8x16_t v128_8;
-+ poly16x8_t v128_16;
-+
-+ s64_8 (v64_8); /* { dg-message "use -flax-vector-conversions" } */
-+ /* { dg-error "incompatible type for argument 1 of 's64_8'" "" { target *-*-* } 31 } */
-+ u64_8 (v64_8); /* { dg-error "incompatible type for argument 1 of 'u64_8'" } */
-+ p64_8 (v64_8);
-+
-+ s64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 's64_16'" } */
-+ u64_16 (v64_16); /* { dg-error "incompatible type for argument 1 of 'u64_16'" } */
-+ p64_16 (v64_16);
-+
-+ s128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 's128_8'" } */
-+ u128_8 (v128_8); /* { dg-error "incompatible type for argument 1 of 'u128_8'" } */
-+ p128_8 (v128_8);
-+
-+ s128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 's128_16'" } */
-+ u128_16 (v128_16); /* { dg-error "incompatible type for argument 1 of 'u128_16'" } */
-+ p128_16 (v128_16);
-+}
-+/* { dg-message "note: expected '\[^'\n\]*' but argument is of type '\[^'\n\]*'" "note: expected" { target *-*-* } 0 } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/pr37780_1.c
-@@ -0,0 +1,48 @@
-+/* Test that we can remove the conditional move due to CLZ
-+ being defined at zero. */
-+
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_arch_v6t2_ok } */
-+/* { dg-options "-O2" } */
-+/* { dg-add-options arm_arch_v6t2 } */
-+
-+int
-+fooctz (int i)
-+{
-+ return (i == 0) ? 32 : __builtin_ctz (i);
-+}
-+
-+int
-+fooctz2 (int i)
-+{
-+ return (i != 0) ? __builtin_ctz (i) : 32;
-+}
-+
-+unsigned int
-+fooctz3 (unsigned int i)
-+{
-+ return (i > 0) ? __builtin_ctz (i) : 32;
-+}
-+
-+/* { dg-final { scan-assembler-times "rbit\t*" 3 } } */
-+
-+int
-+fooclz (int i)
-+{
-+ return (i == 0) ? 32 : __builtin_clz (i);
-+}
-+
-+int
-+fooclz2 (int i)
-+{
-+ return (i != 0) ? __builtin_clz (i) : 32;
-+}
-+
-+unsigned int
-+fooclz3 (unsigned int i)
-+{
-+ return (i > 0) ? __builtin_clz (i) : 32;
-+}
-+
-+/* { dg-final { scan-assembler-times "clz\t" 6 } } */
-+/* { dg-final { scan-assembler-not "cmp\t.*0" } } */
---- a/src/gcc/testsuite/gcc.target/arm/pr42574.c
-+++ b/src/gcc/testsuite/gcc.target/arm/pr42574.c
-@@ -1,5 +1,5 @@
-+/* { dg-do compile { target { arm_thumb1_ok && { ! arm_thumb1_movt_ok } } } } */
- /* { dg-options "-mthumb -Os -fpic" } */
--/* { dg-require-effective-target arm_thumb1_ok } */
- /* { dg-require-effective-target fpic } */
- /* Make sure the address of glob.c is calculated only once and using
- a logical shift for the offset (200<<1). */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/pr51534.c
-@@ -0,0 +1,83 @@
-+/* Test the vector comparison intrinsics when comparing to immediate zero.
-+ */
-+
-+/* { dg-do assemble } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-save-temps -mfloat-abi=hard -O3" } */
-+/* { dg-add-options arm_neon } */
-+
-+#include <arm_neon.h>
-+
-+#define GEN_TEST(T, D, C, R) \
-+ R test_##C##_##T (T a) { return C (a, D (0)); }
-+
-+#define GEN_DOUBLE_TESTS(S, T, C) \
-+ GEN_TEST (T, vdup_n_s##S, C##_s##S, u##T) \
-+ GEN_TEST (u##T, vdup_n_u##S, C##_u##S, u##T)
-+
-+#define GEN_QUAD_TESTS(S, T, C) \
-+ GEN_TEST (T, vdupq_n_s##S, C##q_s##S, u##T) \
-+ GEN_TEST (u##T, vdupq_n_u##S, C##q_u##S, u##T)
-+
-+#define GEN_COND_TESTS(C) \
-+ GEN_DOUBLE_TESTS (8, int8x8_t, C) \
-+ GEN_DOUBLE_TESTS (16, int16x4_t, C) \
-+ GEN_DOUBLE_TESTS (32, int32x2_t, C) \
-+ GEN_QUAD_TESTS (8, int8x16_t, C) \
-+ GEN_QUAD_TESTS (16, int16x8_t, C) \
-+ GEN_QUAD_TESTS (32, int32x4_t, C)
-+
-+GEN_COND_TESTS(vcgt)
-+GEN_COND_TESTS(vcge)
-+GEN_COND_TESTS(vclt)
-+GEN_COND_TESTS(vcle)
-+GEN_COND_TESTS(vceq)
-+
-+/* Scan for expected outputs. */
-+/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcgt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcgt\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcgt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcgt\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcgt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcgt\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcge\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcge\.u8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcge\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcge\.u16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vcge\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vcge\.u32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+" 2 } } */
-+/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vclt\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vclt\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vclt\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vcle\.s8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vcle\.s16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler "vcle\.s32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" } } */
-+/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */
-+/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */
-+/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, #0" 2 } } */
-+/* { dg-final { scan-assembler-times "vceq\.i8\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */
-+/* { dg-final { scan-assembler-times "vceq\.i16\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */
-+/* { dg-final { scan-assembler-times "vceq\.i32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 2 } } */
-+
-+/* And ensure we don't have unexpected output too. */
-+/* { dg-final { scan-assembler-not "vc\[gl\]\[te\]\.u\[0-9\]+\[ \]+\[qQdD\]\[0-9\]+, \[qQdD\]\[0-9\]+, #0" } } */
-+
-+/* Tidy up. */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/pr79145.c
-@@ -0,0 +1,16 @@
-+/* { dg-do compile } */
-+/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mcpu=*" } { "-mcpu=iwmmxt" } } */
-+/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mabi=*" } { "-mabi=iwmmxt" } } */
-+/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-march=*" } { "-march=iwmmxt" } } */
-+/* { dg-skip-if "Test is specific to ARM mode" { arm*-*-* } { "-mthumb" } { "" } } */
-+/* { dg-require-effective-target arm32 } */
-+/* { dg-require-effective-target arm_iwmmxt_ok } */
-+/* { dg-options "-mcpu=iwmmxt" } */
-+
-+int
-+main (void)
-+{
-+ volatile long long t1;
-+ t1 ^= 0x55;
-+ return 0;
-+}
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/short-vfp-1.c
-@@ -0,0 +1,45 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_vfp_ok } */
-+/* { dg-options "-mfpu=vfp" } */
-+
-+int
-+test_sisf (float x)
-+{
-+ return (int)x;
-+}
-+
-+short
-+test_hisf (float x)
-+{
-+ return (short)x;
-+}
-+
-+float
-+test_sfsi (int x)
-+{
-+ return (float)x;
-+}
-+
-+float
-+test_sfhi (short x)
-+{
-+ return (float)x;
-+}
-+
-+short
-+test_hisi (int x)
-+{
-+ return (short)x;
-+}
-+
-+int
-+test_sihi (short x)
-+{
-+ return (int)x;
-+}
-+
-+/* { dg-final { scan-assembler-times {vcvt\.s32\.f32\ts[0-9]+,s[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {vcvt\.f32\.s32\ts[0-9]+,s[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {vmov\tr[0-9]+,s[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {vmov\ts[0-9]+,r[0-9]+} 2 } } */
-+/* { dg-final { scan-assembler-times {sxth\tr[0-9]+,r[0-9]+} 2 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/simd/vmaxnm_f32_1.c
-@@ -0,0 +1,159 @@
-+/* Test the `vmaxnm_f32' ARM Neon intrinsic.  */
-+
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-options "-save-temps -O3 -march=armv8-a" } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include "arm_neon.h"
-+
-+extern void abort ();
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__regular_input1 ()
-+{
-+ float32_t a1[] = {1,2};
-+ float32_t b1[] = {3,4};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != b1[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__regular_input2 ()
-+{
-+ float32_t a1[] = {3,2};
-+ float32_t b1[] = {1,4};
-+ float32_t e[] = {3,4};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__quiet_NaN_one_arg ()
-+{
-+ /* When given a quiet NaN, vmaxnm returns the other operand.
-+ In this test case we have NaNs in only one operand. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {1,2};
-+ float32_t b1[] = {n,n};
-+ float32_t e[] = {1,2};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__quiet_NaN_both_args ()
-+{
-+ /* When given a quiet NaN, vmaxnm returns the other operand.
-+ In this test case we have NaNs in both operands. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,2};
-+ float32_t b1[] = {1,n};
-+ float32_t e[] = {1,2};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__zero_both_args ()
-+{
-+ /* For 0 and -0, vmaxnm returns 0. Since 0 == -0, check sign bit. */
-+ float32_t a1[] = {0.0, 0.0};
-+ float32_t b1[] = {-0.0, -0.0};
-+ float32_t e[] = {0.0, 0.0};
-+
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+
-+ float32_t actual1[2];
-+ vst1_f32 (actual1, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) != 0)
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__inf_both_args ()
-+{
-+ /* The max of inf and inf is inf. The max of -inf and -inf is -inf. */
-+ float32_t inf = __builtin_huge_valf ();
-+ float32_t a1[] = {inf, -inf};
-+ float32_t b1[] = {inf, -inf};
-+ float32_t e[] = {inf, -inf};
-+
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+
-+ float32_t actual1[2];
-+ vst1_f32 (actual1, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual1[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnm_f32__two_quiet_NaNs_both_args ()
-+{
-+ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
-+ not even another NaN, use __builtin_isnan () to check. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,n};
-+ float32_t b1[] = {n,n};
-+ float32_t e[] = {n,n};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vmaxnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (!__builtin_isnan (actual[i]))
-+ abort ();
-+}
-+
-+int
-+main ()
-+{
-+ test_vmaxnm_f32__regular_input1 ();
-+ test_vmaxnm_f32__regular_input2 ();
-+ test_vmaxnm_f32__quiet_NaN_one_arg ();
-+ test_vmaxnm_f32__quiet_NaN_both_args ();
-+ test_vmaxnm_f32__zero_both_args ();
-+ test_vmaxnm_f32__inf_both_args ();
-+ test_vmaxnm_f32__two_quiet_NaNs_both_args ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler-times "vmaxnm\.f32\t\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 7 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/simd/vmaxnmq_f32_1.c
-@@ -0,0 +1,160 @@
-+/* Test the `vmaxnmq_f32' ARM Neon intrinsic.  */
-+
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-options "-save-temps -O3 -march=armv8-a" } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include "arm_neon.h"
-+
-+extern void abort ();
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__regular_input1 ()
-+{
-+ float32_t a1[] = {1,2,5,6};
-+ float32_t b1[] = {3,4,7,8};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != b1[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__regular_input2 ()
-+{
-+ float32_t a1[] = {3,2,7,6};
-+ float32_t b1[] = {1,4,5,8};
-+ float32_t e[] = {3,4,7,8};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__quiet_NaN_one_arg ()
-+{
-+ /* When given a quiet NaN, vmaxnmq returns the other operand.
-+ In this test case we have NaNs in only one operand. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {1,2,3,4};
-+ float32_t b1[] = {n,n,n,n};
-+ float32_t e[] = {1,2,3,4};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__quiet_NaN_both_args ()
-+{
-+ /* When given a quiet NaN, vmaxnmq returns the other operand.
-+ In this test case we have NaNs in both operands. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,2,n,4};
-+ float32_t b1[] = {1,n,3,n};
-+ float32_t e[] = {1,2,3,4};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__zero_both_args ()
-+{
-+ /* For 0 and -0, vmaxnmq returns 0. Since 0 == -0, check sign bit. */
-+ float32_t a1[] = {0.0, 0.0, -0.0, -0.0};
-+ float32_t b1[] = {-0.0, -0.0, 0.0, 0.0};
-+ float32_t e[] = {0.0, 0.0, 0.0, 0.0};
-+
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+
-+ float32_t actual1[4];
-+ vst1q_f32 (actual1, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) != 0)
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__inf_both_args ()
-+{
-+ /* The max of inf and inf is inf. The max of -inf and -inf is -inf. */
-+ float32_t inf = __builtin_huge_valf ();
-+ float32_t a1[] = {inf, -inf, inf, inf};
-+ float32_t b1[] = {inf, -inf, -inf, -inf};
-+ float32_t e[] = {inf, -inf, inf, inf};
-+
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+
-+ float32_t actual1[4];
-+ vst1q_f32 (actual1, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual1[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vmaxnmq_f32__two_quiet_NaNs_both_args ()
-+{
-+ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
-+ not even another NaN, use __builtin_isnan () to check. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,n,n,n};
-+ float32_t b1[] = {n,n,n,n};
-+  float32_t e[] = {n,n,n,n};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vmaxnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (!__builtin_isnan (actual[i]))
-+ abort ();
-+}
-+
-+int
-+main ()
-+{
-+ test_vmaxnmq_f32__regular_input1 ();
-+ test_vmaxnmq_f32__regular_input2 ();
-+ test_vmaxnmq_f32__quiet_NaN_one_arg ();
-+ test_vmaxnmq_f32__quiet_NaN_both_args ();
-+ test_vmaxnmq_f32__zero_both_args ();
-+ test_vmaxnmq_f32__inf_both_args ();
-+ test_vmaxnmq_f32__two_quiet_NaNs_both_args ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler-times "vmaxnm\.f32\t\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+\n" 7 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/simd/vminnm_f32_1.c
-@@ -0,0 +1,159 @@
-+/* Test the `vminnm_f32' ARM Neon intrinsic.  */
-+
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-options "-save-temps -O3 -march=armv8-a" } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include "arm_neon.h"
-+
-+extern void abort ();
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__regular_input1 ()
-+{
-+ float32_t a1[] = {1,2};
-+ float32_t b1[] = {3,4};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != a1[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__regular_input2 ()
-+{
-+ float32_t a1[] = {3,2};
-+ float32_t b1[] = {1,4};
-+ float32_t e[] = {1,2};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__quiet_NaN_one_arg ()
-+{
-+ /* When given a quiet NaN, vminnm returns the other operand.
-+ In this test case we have NaNs in only one operand. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {1,2};
-+ float32_t b1[] = {n,n};
-+ float32_t e[] = {1,2};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__quiet_NaN_both_args ()
-+{
-+ /* When given a quiet NaN, vminnm returns the other operand.
-+ In this test case we have NaNs in both operands. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,2};
-+ float32_t b1[] = {1,n};
-+ float32_t e[] = {1,2};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__zero_both_args ()
-+{
-+ /* For 0 and -0, vminnm returns -0. Since 0 == -0, check sign bit. */
-+ float32_t a1[] = {0.0,0.0};
-+ float32_t b1[] = {-0.0, -0.0};
-+ float32_t e[] = {-0.0, -0.0};
-+
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+
-+ float32_t actual1[2];
-+ vst1_f32 (actual1, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) == 0)
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__inf_both_args ()
-+{
-+ /* The min of inf and inf is inf. The min of -inf and -inf is -inf. */
-+ float32_t inf = __builtin_huge_valf ();
-+ float32_t a1[] = {inf, -inf};
-+ float32_t b1[] = {inf, -inf};
-+ float32_t e[] = {inf, -inf};
-+
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+
-+ float32_t actual1[2];
-+ vst1_f32 (actual1, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (actual1[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnm_f32__two_quiet_NaNs_both_args ()
-+{
-+ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
-+ not even another NaN, use __builtin_isnan () to check. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,n};
-+ float32_t b1[] = {n,n};
-+ float32_t e[] = {n,n};
-+ float32x2_t a = vld1_f32 (a1);
-+ float32x2_t b = vld1_f32 (b1);
-+ float32x2_t c = vminnm_f32 (a, b);
-+ float32_t actual[2];
-+ vst1_f32 (actual, c);
-+
-+ for (int i = 0; i < 2; ++i)
-+ if (!__builtin_isnan (actual[i]))
-+ abort ();
-+}
-+
-+int
-+main ()
-+{
-+ test_vminnm_f32__regular_input1 ();
-+ test_vminnm_f32__regular_input2 ();
-+ test_vminnm_f32__quiet_NaN_one_arg ();
-+ test_vminnm_f32__quiet_NaN_both_args ();
-+ test_vminnm_f32__zero_both_args ();
-+ test_vminnm_f32__inf_both_args ();
-+ test_vminnm_f32__two_quiet_NaNs_both_args ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler-times "vminnm\.f32\t\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 7 } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/simd/vminnmq_f32_1.c
-@@ -0,0 +1,159 @@
-+/* Test the `vminnmq_f32' ARM Neon intrinsic.  */
-+
-+/* { dg-do run } */
-+/* { dg-require-effective-target arm_v8_neon_hw } */
-+/* { dg-options "-save-temps -O3 -march=armv8-a" } */
-+/* { dg-add-options arm_v8_neon } */
-+
-+#include "arm_neon.h"
-+
-+extern void abort ();
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__regular_input1 ()
-+{
-+ float32_t a1[] = {1,2,5,6};
-+ float32_t b1[] = {3,4,7,8};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != a1[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__regular_input2 ()
-+{
-+ float32_t a1[] = {3,2,7,6};
-+ float32_t b1[] = {1,4,5,8};
-+ float32_t e[] = {1,2,5,6};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__quiet_NaN_one_arg ()
-+{
-+ /* When given a quiet NaN, vminnmq returns the other operand.
-+ In this test case we have NaNs in only one operand. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {1,2,3,4};
-+ float32_t b1[] = {n,n,n,n};
-+ float32_t e[] = {1,2,3,4};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__quiet_NaN_both_args ()
-+{
-+ /* When given a quiet NaN, vminnmq returns the other operand.
-+ In this test case we have NaNs in both operands. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,2,n,4};
-+ float32_t b1[] = {1,n,3,n};
-+ float32_t e[] = {1,2,3,4};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__zero_both_args ()
-+{
-+ /* For 0 and -0, vminnmq returns -0. Since 0 == -0, check sign bit. */
-+ float32_t a1[] = {0.0, 0.0, -0.0, -0.0};
-+ float32_t b1[] = {-0.0, -0.0, 0.0, 0.0};
-+ float32_t e[] = {-0.0, -0.0, -0.0, -0.0};
-+
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+
-+ float32_t actual1[4];
-+ vst1q_f32 (actual1, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual1[i] != e[i] || __builtin_signbit (actual1[i]) == 0)
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__inf_both_args ()
-+{
-+ /* The min of inf and inf is inf. The min of -inf and -inf is -inf. */
-+ float32_t inf = __builtin_huge_valf ();
-+ float32_t a1[] = {inf, -inf, inf, inf};
-+ float32_t b1[] = {inf, -inf, -inf, -inf};
-+ float32_t e[] = {inf, -inf, -inf, -inf};
-+
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+
-+ float32_t actual1[4];
-+ vst1q_f32 (actual1, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (actual1[i] != e[i])
-+ abort ();
-+}
-+
-+void __attribute__ ((noinline))
-+test_vminnmq_f32__two_quiet_NaNs_both_args ()
-+{
-+ /* When given 2 NaNs, return a NaN. Since a NaN is not equal to anything,
-+ not even another NaN, use __builtin_isnan () to check. */
-+ float32_t n = __builtin_nanf ("");
-+ float32_t a1[] = {n,n,n,n};
-+ float32_t b1[] = {n,n,n,n};
-+  float32_t e[] = {n,n,n,n};
-+ float32x4_t a = vld1q_f32 (a1);
-+ float32x4_t b = vld1q_f32 (b1);
-+ float32x4_t c = vminnmq_f32 (a, b);
-+ float32_t actual[4];
-+ vst1q_f32 (actual, c);
-+
-+ for (int i = 0; i < 4; ++i)
-+ if (!__builtin_isnan (actual[i]))
-+ abort ();
-+}
-+
-+int
-+main ()
-+{
-+ test_vminnmq_f32__regular_input1 ();
-+ test_vminnmq_f32__regular_input2 ();
-+ test_vminnmq_f32__quiet_NaN_one_arg ();
-+ test_vminnmq_f32__quiet_NaN_both_args ();
-+ test_vminnmq_f32__zero_both_args ();
-+ test_vminnmq_f32__inf_both_args ();
-+ test_vminnmq_f32__two_quiet_NaNs_both_args ();
-+ return 0;
-+}
-+
-+/* { dg-final { scan-assembler-times "vminnm\.f32\t\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+, ?\[qQ\]\[0-9\]+\n" 7 } } */
---- a/src/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c
-+++ b/src/gcc/testsuite/gcc.target/arm/unsigned-extend-2.c
-@@ -2,13 +2,13 @@
- /* { dg-require-effective-target arm_thumb2_ok } */
- /* { dg-options "-O" } */
-
--unsigned short foo (unsigned short x)
-+unsigned short foo (unsigned short x, unsigned short c)
- {
- unsigned char i = 0;
- for (i = 0; i < 8; i++)
- {
- x >>= 1;
-- x &= 0x7fff;
-+ x &= c;
- }
- return x;
- }
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/vect-vcvt.c
-@@ -0,0 +1,27 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -mvectorize-with-neon-double" } */
-+/* { dg-add-options arm_neon } */
-+
-+#define N 32
-+
-+int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
-+float fa[N];
-+int ia[N];
-+
-+int convert()
-+{
-+ int i;
-+
-+ /* int -> float */
-+ for (i = 0; i < N; i++)
-+ fa[i] = (float) ib[i];
-+
-+ /* float -> int */
-+ for (i = 0; i < N; i++)
-+ ia[i] = (int) fa[i];
-+
-+ return 0;
-+}
-+
-+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/vect-vcvtq.c
-@@ -0,0 +1,27 @@
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
-+/* { dg-add-options arm_neon } */
-+
-+#define N 32
-+
-+int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
-+float fa[N];
-+int ia[N];
-+
-+int convert()
-+{
-+ int i;
-+
-+ /* int -> float */
-+ for (i = 0; i < N; i++)
-+ fa[i] = (float) ib[i];
-+
-+ /* float -> int */
-+ for (i = 0; i < N; i++)
-+ ia[i] = (int) fa[i];
-+
-+ return 0;
-+}
-+
-+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/vfp-shift-a2t2.c
-@@ -0,0 +1,27 @@
-+/* Check that NEON vector shifts support immediate values == size.  */
-+
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-options "-save-temps" } */
-+/* { dg-add-options arm_neon } */
-+
-+#include <arm_neon.h>
-+
-+uint16x8_t test_vshll_n_u8 (uint8x8_t a)
-+{
-+ return vshll_n_u8(a, 8);
-+}
-+
-+uint32x4_t test_vshll_n_u16 (uint16x4_t a)
-+{
-+ return vshll_n_u16(a, 16);
-+}
-+
-+uint64x2_t test_vshll_n_u32 (uint32x2_t a)
-+{
-+ return vshll_n_u32(a, 32);
-+}
-+
-+/* { dg-final { scan-assembler "vshll\.u16\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-+/* { dg-final { scan-assembler "vshll\.u32\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-+/* { dg-final { scan-assembler "vshll\.u8\[ \]+\[qQ\]\[0-9\]+, \[dD\]\[0-9\]+, #\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
---- /dev/null
-+++ b/src/gcc/testsuite/gcc.target/arm/vst1Q_laneu64-1.c
-@@ -0,0 +1,25 @@
-+/* Test the `vst1Q_laneu64' ARM Neon intrinsic. */
-+
-+/* Detect ICE in the case of unaligned memory address. */
-+
-+/* { dg-do compile } */
-+/* { dg-require-effective-target arm_neon_ok } */
-+/* { dg-add-options arm_neon } */
-+
-+#include "arm_neon.h"
-+
-+unsigned char dummy_store[1000];
-+
-+void
-+foo (unsigned char* addr)
-+{
-+ uint8x16_t vdata = vld1q_u8 (addr);
-+ vst1q_lane_u64 ((uint64_t*) &dummy_store, vreinterpretq_u64_u8 (vdata), 0);
-+}
-+
-+uint64_t
-+bar (uint64x2_t vdata)
-+{
-+ vdata = vld1q_lane_u64 ((uint64_t*) &dummy_store, vdata, 0);
-+ return vgetq_lane_u64 (vdata, 0);
-+}
---- a/src/gcc/testsuite/lib/gcc-dg.exp
-+++ b/src/gcc/testsuite/lib/gcc-dg.exp
-@@ -403,6 +403,7 @@ if { [info procs ${tool}_load] != [list] \
- switch [lindex $result 0] {
- "pass" { set status "fail" }
- "fail" { set status "pass" }
-+ default { set status [lindex $result 0] }
- }
- set result [list $status [lindex $result 1]]
- }
---- a/src/gcc/testsuite/lib/target-supports.exp
-+++ b/src/gcc/testsuite/lib/target-supports.exp
-@@ -252,6 +252,20 @@ proc check_runtime {prop args} {
- }]
- }
-
-+# Return 1 if GCC was configured with $pattern.
-+proc check_configured_with { pattern } {
-+ global tool
-+
-+ set gcc_output [${tool}_target_compile "-v" "" "none" ""]
-+ if { [ regexp "Configured with: \[^\n\]*$pattern" $gcc_output ] } {
-+ verbose "Matched: $pattern" 2
-+ return 1
-+ }
-+
-+ verbose "Failed to match: $pattern" 2
-+ return 0
-+}
-+
- ###############################
- # proc check_weak_available { }
- ###############################
-@@ -2936,6 +2950,28 @@ proc add_options_for_arm_v8_1a_neon { flags } {
- return "$flags $et_arm_v8_1a_neon_flags -march=armv8.1-a"
- }
-
-+# Add the options needed for ARMv8.2 with the scalar FP16 extension.
-+# Also adds the ARMv8 FP options for ARM and for AArch64.
-+
-+proc add_options_for_arm_v8_2a_fp16_scalar { flags } {
-+ if { ! [check_effective_target_arm_v8_2a_fp16_scalar_ok] } {
-+ return "$flags"
-+ }
-+ global et_arm_v8_2a_fp16_scalar_flags
-+ return "$flags $et_arm_v8_2a_fp16_scalar_flags"
-+}
-+
-+# Add the options needed for ARMv8.2 with the FP16 extension. Also adds
-+# the ARMv8 NEON options for ARM and for AArch64.
-+
-+proc add_options_for_arm_v8_2a_fp16_neon { flags } {
-+ if { ! [check_effective_target_arm_v8_2a_fp16_neon_ok] } {
-+ return "$flags"
-+ }
-+ global et_arm_v8_2a_fp16_neon_flags
-+ return "$flags $et_arm_v8_2a_fp16_neon_flags"
-+}
-+
- proc add_options_for_arm_crc { flags } {
- if { ! [check_effective_target_arm_crc_ok] } {
- return "$flags"
-@@ -3022,23 +3058,25 @@ proc check_effective_target_arm_crc_ok { } {
-
- proc check_effective_target_arm_neon_fp16_ok_nocache { } {
- global et_arm_neon_fp16_flags
-+ global et_arm_neon_flags
- set et_arm_neon_fp16_flags ""
-- if { [check_effective_target_arm32] } {
-+ if { [check_effective_target_arm32]
-+ && [check_effective_target_arm_neon_ok] } {
- foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-fp16"
- "-mfpu=neon-fp16 -mfloat-abi=softfp"
- "-mfp16-format=ieee"
- "-mfloat-abi=softfp -mfp16-format=ieee"
- "-mfpu=neon-fp16 -mfp16-format=ieee"
- "-mfpu=neon-fp16 -mfloat-abi=softfp -mfp16-format=ieee"} {
-- if { [check_no_compiler_messages_nocache arm_neon_fp_16_ok object {
-+ if { [check_no_compiler_messages_nocache arm_neon_fp16_ok object {
- #include "arm_neon.h"
- float16x4_t
- foo (float32x4_t arg)
- {
- return vcvt_f16_f32 (arg);
- }
-- } "$flags"] } {
-- set et_arm_neon_fp16_flags $flags
-+ } "$et_arm_neon_flags $flags"] } {
-+ set et_arm_neon_fp16_flags [concat $et_arm_neon_flags $flags]
- return 1
- }
- }
-@@ -3075,6 +3113,65 @@ proc add_options_for_arm_neon_fp16 { flags } {
- return "$flags $et_arm_neon_fp16_flags"
- }
-
-+# Return 1 if this is an ARM target supporting the FP16 alternative
-+# format. Some multilibs may be incompatible with the options needed. Also
-+# set et_arm_neon_fp16_flags to the best options to add.
-+
-+proc check_effective_target_arm_fp16_alternative_ok_nocache { } {
-+ global et_arm_neon_fp16_flags
-+ set et_arm_neon_fp16_flags ""
-+ if { [check_effective_target_arm32] } {
-+ foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-fp16"
-+ "-mfpu=neon-fp16 -mfloat-abi=softfp"} {
-+ if { [check_no_compiler_messages_nocache \
-+ arm_fp16_alternative_ok object {
-+ #if !defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ #error __ARM_FP16_FORMAT_ALTERNATIVE not defined
-+ #endif
-+ } "$flags -mfp16-format=alternative"] } {
-+ set et_arm_neon_fp16_flags "$flags -mfp16-format=alternative"
-+ return 1
-+ }
-+ }
-+ }
-+
-+ return 0
-+}
-+
-+proc check_effective_target_arm_fp16_alternative_ok { } {
-+ return [check_cached_effective_target arm_fp16_alternative_ok \
-+ check_effective_target_arm_fp16_alternative_ok_nocache]
-+}
-+
-+# Return 1 if this is an ARM target that supports specifying the FP16 none
-+# format. Some multilibs may be incompatible with the options needed.
-+
-+proc check_effective_target_arm_fp16_none_ok_nocache { } {
-+ if { [check_effective_target_arm32] } {
-+ foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-fp16"
-+ "-mfpu=neon-fp16 -mfloat-abi=softfp"} {
-+ if { [check_no_compiler_messages_nocache \
-+ arm_fp16_none_ok object {
-+ #if defined (__ARM_FP16_FORMAT_ALTERNATIVE)
-+ #error __ARM_FP16_FORMAT_ALTERNATIVE defined
-+ #endif
-+ #if defined (__ARM_FP16_FORMAT_IEEE)
-+ #error __ARM_FP16_FORMAT_IEEE defined
-+ #endif
-+ } "$flags -mfp16-format=none"] } {
-+ return 1
-+ }
-+ }
-+ }
-+
-+ return 0
-+}
-+
-+proc check_effective_target_arm_fp16_none_ok { } {
-+ return [check_cached_effective_target arm_fp16_none_ok \
-+ check_effective_target_arm_fp16_none_ok_nocache]
-+}
-+
- # Return 1 if this is an ARM target supporting -mfpu=neon-fp-armv8
- # -mfloat-abi=softfp or equivalent options. Some multilibs may be
- # incompatible with these options. Also set et_arm_v8_neon_flags to the
-@@ -3117,8 +3214,10 @@ proc check_effective_target_arm_v8_neon_ok { } {
-
- proc check_effective_target_arm_neonv2_ok_nocache { } {
- global et_arm_neonv2_flags
-+ global et_arm_neon_flags
- set et_arm_neonv2_flags ""
-- if { [check_effective_target_arm32] } {
-+ if { [check_effective_target_arm32]
-+ && [check_effective_target_arm_neon_ok] } {
- foreach flags {"" "-mfloat-abi=softfp" "-mfpu=neon-vfpv4" "-mfpu=neon-vfpv4 -mfloat-abi=softfp"} {
- if { [check_no_compiler_messages_nocache arm_neonv2_ok object {
- #include "arm_neon.h"
-@@ -3127,8 +3226,8 @@ proc check_effective_target_arm_neonv2_ok_nocache { } {
- {
- return vfma_f32 (a, b, c);
- }
-- } "$flags"] } {
-- set et_arm_neonv2_flags $flags
-+ } "$et_arm_neon_flags $flags"] } {
-+ set et_arm_neonv2_flags [concat $et_arm_neon_flags $flags]
- return 1
- }
- }
-@@ -3142,9 +3241,9 @@ proc check_effective_target_arm_neonv2_ok { } {
- check_effective_target_arm_neonv2_ok_nocache]
- }
-
--# Add the options needed for NEON. We need either -mfloat-abi=softfp
--# or -mfloat-abi=hard, but if one is already specified by the
--# multilib, use it.
-+# Add the options needed for VFP FP16 support. We need either
-+# -mfloat-abi=softfp or -mfloat-abi=hard. If one is already specified by
-+# the multilib, use it.
-
- proc add_options_for_arm_fp16 { flags } {
- if { ! [check_effective_target_arm_fp16_ok] } {
-@@ -3154,9 +3253,32 @@ proc add_options_for_arm_fp16 { flags } {
- return "$flags $et_arm_fp16_flags"
- }
-
-+# Add the options needed to enable support for the IEEE half-precision
-+# format.  This is valid for ARM targets.
-+
-+proc add_options_for_arm_fp16_ieee { flags } {
-+ if { ! [check_effective_target_arm_fp16_ok] } {
-+ return "$flags"
-+ }
-+ global et_arm_fp16_flags
-+ return "$flags $et_arm_fp16_flags -mfp16-format=ieee"
-+}
-+
-+# Add the options needed to enable support for the ARM alternative
-+# half-precision format.  This is valid for ARM targets.
-+
-+proc add_options_for_arm_fp16_alternative { flags } {
-+ if { ! [check_effective_target_arm_fp16_ok] } {
-+ return "$flags"
-+ }
-+ global et_arm_fp16_flags
-+ return "$flags $et_arm_fp16_flags -mfp16-format=alternative"
-+}
-+
- # Return 1 if this is an ARM target that can support a VFP fp16 variant.
- # Skip multilibs that are incompatible with these options and set
--# et_arm_fp16_flags to the best options to add.
-+# et_arm_fp16_flags to the best options to add. This test is valid for
-+# ARM only.
-
- proc check_effective_target_arm_fp16_ok_nocache { } {
- global et_arm_fp16_flags
-@@ -3164,7 +3286,10 @@ proc check_effective_target_arm_fp16_ok_nocache { } {
- if { ! [check_effective_target_arm32] } {
- return 0;
- }
-- if [check-flags [list "" { *-*-* } { "-mfpu=*" } { "-mfpu=*fp16*" "-mfpu=*fpv[4-9]*" "-mfpu=*fpv[1-9][0-9]*" } ]] {
-+ if [check-flags \
-+ [list "" { *-*-* } { "-mfpu=*" } \
-+ { "-mfpu=*fp16*" "-mfpu=*fpv[4-9]*" \
-+ "-mfpu=*fpv[1-9][0-9]*" "-mfpu=*fp-armv8*" } ]] {
- # Multilib flags would override -mfpu.
- return 0
- }
-@@ -3200,6 +3325,28 @@ proc check_effective_target_arm_fp16_ok { } {
- check_effective_target_arm_fp16_ok_nocache]
- }
-
-+# Return 1 if the target supports executing VFP FP16 instructions, 0
-+# otherwise. This test is valid for ARM only.
-+
-+proc check_effective_target_arm_fp16_hw { } {
-+ if {! [check_effective_target_arm_fp16_ok] } {
-+ return 0
-+ }
-+ global et_arm_fp16_flags
-+ check_runtime_nocache arm_fp16_hw {
-+ int
-+ main (int argc, char **argv)
-+ {
-+ __fp16 a = 1.0;
-+ float r;
-+ asm ("vcvtb.f32.f16 %0, %1"
-+ : "=w" (r) : "w" (a)
-+ : /* No clobbers. */);
-+ return (r == 1.0) ? 0 : 1;
-+ }
-+ } "$et_arm_fp16_flags -mfp16-format=ieee"
-+}
-+
- # Creates a series of routines that return 1 if the given architecture
- # can be selected and a routine to give the flags to select that architecture
- # Note: Extra flags may be added to disable options from newer compilers
-@@ -3209,22 +3356,26 @@ proc check_effective_target_arm_fp16_ok { } {
- # Usage: /* { dg-require-effective-target arm_arch_v5_ok } */
- # /* { dg-add-options arm_arch_v5 } */
- # /* { dg-require-effective-target arm_arch_v5_multilib } */
--foreach { armfunc armflag armdef } { v4 "-march=armv4 -marm" __ARM_ARCH_4__
-- v4t "-march=armv4t" __ARM_ARCH_4T__
-- v5 "-march=armv5 -marm" __ARM_ARCH_5__
-- v5t "-march=armv5t" __ARM_ARCH_5T__
-- v5te "-march=armv5te" __ARM_ARCH_5TE__
-- v6 "-march=armv6" __ARM_ARCH_6__
-- v6k "-march=armv6k" __ARM_ARCH_6K__
-- v6t2 "-march=armv6t2" __ARM_ARCH_6T2__
-- v6z "-march=armv6z" __ARM_ARCH_6Z__
-- v6m "-march=armv6-m -mthumb" __ARM_ARCH_6M__
-- v7a "-march=armv7-a" __ARM_ARCH_7A__
-- v7r "-march=armv7-r" __ARM_ARCH_7R__
-- v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__
-- v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__
-- v8a "-march=armv8-a" __ARM_ARCH_8A__
-- v8_1a "-march=armv8.1a" __ARM_ARCH_8A__ } {
-+foreach { armfunc armflag armdef } {
-+ v4 "-march=armv4 -marm" __ARM_ARCH_4__
-+ v4t "-march=armv4t" __ARM_ARCH_4T__
-+ v5 "-march=armv5 -marm" __ARM_ARCH_5__
-+ v5t "-march=armv5t" __ARM_ARCH_5T__
-+ v5te "-march=armv5te" __ARM_ARCH_5TE__
-+ v6 "-march=armv6" __ARM_ARCH_6__
-+ v6k "-march=armv6k" __ARM_ARCH_6K__
-+ v6t2 "-march=armv6t2" __ARM_ARCH_6T2__
-+ v6z "-march=armv6z" __ARM_ARCH_6Z__
-+ v6m "-march=armv6-m -mthumb -mfloat-abi=soft" __ARM_ARCH_6M__
-+ v7a "-march=armv7-a" __ARM_ARCH_7A__
-+ v7r "-march=armv7-r" __ARM_ARCH_7R__
-+ v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__
-+ v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__
-+ v8a "-march=armv8-a" __ARM_ARCH_8A__
-+ v8_1a "-march=armv8.1a" __ARM_ARCH_8A__
-+ v8_2a "-march=armv8.2a" __ARM_ARCH_8A__
-+ v8m_base "-march=armv8-m.base -mthumb -mfloat-abi=soft" __ARM_ARCH_8M_BASE__
-+ v8m_main "-march=armv8-m.main -mthumb" __ARM_ARCH_8M_MAIN__ } {
- eval [string map [list FUNC $armfunc FLAG $armflag DEF $armdef ] {
- proc check_effective_target_arm_arch_FUNC_ok { } {
- if { [ string match "*-marm*" "FLAG" ] &&
-@@ -3274,6 +3425,12 @@ proc add_options_for_arm_arch_v7ve { flags } {
- return "$flags -march=armv7ve"
- }
-
-+# Return 1 if GCC was configured with --with-mode=
-+proc check_effective_target_default_mode { } {
-+
-+ return [check_configured_with "with-mode="]
-+}
-+
- # Return 1 if this is an ARM target where -marm causes ARM to be
- # used (not Thumb)
-
-@@ -3352,15 +3509,60 @@ proc check_effective_target_arm_cortex_m { } {
- return 0
- }
- return [check_no_compiler_messages arm_cortex_m assembly {
-- #if !defined(__ARM_ARCH_7M__) \
-- && !defined (__ARM_ARCH_7EM__) \
-- && !defined (__ARM_ARCH_6M__)
-- #error !__ARM_ARCH_7M__ && !__ARM_ARCH_7EM__ && !__ARM_ARCH_6M__
-+ #if defined(__ARM_ARCH_ISA_ARM)
-+ #error __ARM_ARCH_ISA_ARM is defined
- #endif
- int i;
- } "-mthumb"]
- }
-
-+# Return 1 if this is an ARM target where -mthumb causes Thumb-1 to be
-+# used and MOVT/MOVW instructions to be available.
-+
-+proc check_effective_target_arm_thumb1_movt_ok {} {
-+ if [check_effective_target_arm_thumb1_ok] {
-+ return [check_no_compiler_messages arm_movt object {
-+ int
-+ foo (void)
-+ {
-+ asm ("movt r0, #42");
-+ }
-+ } "-mthumb"]
-+ } else {
-+ return 0
-+ }
-+}
-+
-+# Return 1 if this is an ARM target where -mthumb causes Thumb-1 to be
-+# used and CBZ and CBNZ instructions are available.
-+
-+proc check_effective_target_arm_thumb1_cbz_ok {} {
-+ if [check_effective_target_arm_thumb1_ok] {
-+	return [check_no_compiler_messages arm_cbz object {
-+ int
-+ foo (void)
-+ {
-+ asm ("cbz r0, 2f\n2:");
-+ }
-+ } "-mthumb"]
-+ } else {
-+ return 0
-+ }
-+}
-+
-+# Return 1 if this is an ARM target where the ARMv8-M Security Extensions are
-+# available.
-+
-+proc check_effective_target_arm_cmse_ok {} {
-+ return [check_no_compiler_messages arm_cmse object {
-+ int
-+ foo (void)
-+ {
-+ asm ("bxns r0");
-+ }
-+ } "-mcmse"];
-+}
-+
- # Return 1 if this compilation turns on string_ops_prefer_neon on.
-
- proc check_effective_target_arm_tune_string_ops_prefer_neon { } {
-@@ -3436,6 +3638,76 @@ proc check_effective_target_arm_v8_1a_neon_ok { } {
- check_effective_target_arm_v8_1a_neon_ok_nocache]
- }
-
-+# Return 1 if the target supports ARMv8.2 scalar FP16 arithmetic
-+# instructions, 0 otherwise. The test is valid for ARM and for AArch64.
-+# Record the command line options needed.
-+
-+proc check_effective_target_arm_v8_2a_fp16_scalar_ok_nocache { } {
-+ global et_arm_v8_2a_fp16_scalar_flags
-+ set et_arm_v8_2a_fp16_scalar_flags ""
-+
-+ if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } {
-+ return 0;
-+ }
-+
-+ # Iterate through sets of options to find the compiler flags that
-+ # need to be added to the -march option.
-+ foreach flags {"" "-mfpu=fp-armv8" "-mfloat-abi=softfp" \
-+ "-mfpu=fp-armv8 -mfloat-abi=softfp"} {
-+ if { [check_no_compiler_messages_nocache \
-+ arm_v8_2a_fp16_scalar_ok object {
-+ #if !defined (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
-+ #error "__ARM_FEATURE_FP16_SCALAR_ARITHMETIC not defined"
-+ #endif
-+ } "$flags -march=armv8.2-a+fp16"] } {
-+ set et_arm_v8_2a_fp16_scalar_flags "$flags -march=armv8.2-a+fp16"
-+ return 1
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+proc check_effective_target_arm_v8_2a_fp16_scalar_ok { } {
-+ return [check_cached_effective_target arm_v8_2a_fp16_scalar_ok \
-+ check_effective_target_arm_v8_2a_fp16_scalar_ok_nocache]
-+}
-+
-+# Return 1 if the target supports ARMv8.2 Adv.SIMD FP16 arithmetic
-+# instructions, 0 otherwise. The test is valid for ARM and for AArch64.
-+# Record the command line options needed.
-+
-+proc check_effective_target_arm_v8_2a_fp16_neon_ok_nocache { } {
-+ global et_arm_v8_2a_fp16_neon_flags
-+ set et_arm_v8_2a_fp16_neon_flags ""
-+
-+ if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } {
-+ return 0;
-+ }
-+
-+ # Iterate through sets of options to find the compiler flags that
-+ # need to be added to the -march option.
-+ foreach flags {"" "-mfpu=neon-fp-armv8" "-mfloat-abi=softfp" \
-+ "-mfpu=neon-fp-armv8 -mfloat-abi=softfp"} {
-+ if { [check_no_compiler_messages_nocache \
-+ arm_v8_2a_fp16_neon_ok object {
-+ #if !defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-+ #error "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC not defined"
-+ #endif
-+ } "$flags -march=armv8.2-a+fp16"] } {
-+ set et_arm_v8_2a_fp16_neon_flags "$flags -march=armv8.2-a+fp16"
-+ return 1
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+proc check_effective_target_arm_v8_2a_fp16_neon_ok { } {
-+ return [check_cached_effective_target arm_v8_2a_fp16_neon_ok \
-+ check_effective_target_arm_v8_2a_fp16_neon_ok_nocache]
-+}
-+
- # Return 1 if the target supports executing ARMv8 NEON instructions, 0
- # otherwise.
-
-@@ -3445,11 +3717,17 @@ proc check_effective_target_arm_v8_neon_hw { } {
- int
- main (void)
- {
-- float32x2_t a;
-+ float32x2_t a = { 1.0f, 2.0f };
-+ #ifdef __ARM_ARCH_ISA_A64
-+ asm ("frinta %0.2s, %1.2s"
-+ : "=w" (a)
-+ : "w" (a));
-+ #else
- asm ("vrinta.f32 %P0, %P1"
- : "=w" (a)
- : "0" (a));
-- return 0;
-+ #endif
-+ return a[0] == 2.0f;
- }
- } [add_options_for_arm_v8_neon ""]]
- }
-@@ -3492,6 +3770,81 @@ proc check_effective_target_arm_v8_1a_neon_hw { } {
- } [add_options_for_arm_v8_1a_neon ""]]
- }
-
-+# Return 1 if the target supports executing floating point instructions from
-+# ARMv8.2 with the FP16 extension, 0 otherwise. The test is valid for ARM and
-+# for AArch64.
-+
-+proc check_effective_target_arm_v8_2a_fp16_scalar_hw { } {
-+ if { ![check_effective_target_arm_v8_2a_fp16_scalar_ok] } {
-+ return 0;
-+ }
-+ return [check_runtime arm_v8_2a_fp16_scalar_hw_available {
-+ int
-+ main (void)
-+ {
-+ __fp16 a = 1.0;
-+ __fp16 result;
-+
-+ #ifdef __ARM_ARCH_ISA_A64
-+
-+ asm ("fabs %h0, %h1"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers. */);
-+
-+ #else
-+
-+ asm ("vabs.f16 %0, %1"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers. */);
-+
-+ #endif
-+
-+ return (result == 1.0) ? 0 : 1;
-+ }
-+ } [add_options_for_arm_v8_2a_fp16_scalar ""]]
-+}
-+
-+# Return 1 if the target supports executing Adv.SIMD instructions from ARMv8.2
-+# with the FP16 extension, 0 otherwise. The test is valid for ARM and for
-+# AArch64.
-+
-+proc check_effective_target_arm_v8_2a_fp16_neon_hw { } {
-+ if { ![check_effective_target_arm_v8_2a_fp16_neon_ok] } {
-+ return 0;
-+ }
-+ return [check_runtime arm_v8_2a_fp16_neon_hw_available {
-+ int
-+ main (void)
-+ {
-+ #ifdef __ARM_ARCH_ISA_A64
-+
-+ __Float16x4_t a = {1.0, -1.0, 1.0, -1.0};
-+ __Float16x4_t result;
-+
-+ asm ("fabs %0.4h, %1.4h"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers. */);
-+
-+ #else
-+
-+ __simd64_float16_t a = {1.0, -1.0, 1.0, -1.0};
-+ __simd64_float16_t result;
-+
-+ asm ("vabs.f16 %P0, %P1"
-+ : "=w"(result)
-+ : "w"(a)
-+ : /* No clobbers. */);
-+
-+ #endif
-+
-+ return (result[0] == 1.0) ? 0 : 1;
-+ }
-+ } [add_options_for_arm_v8_2a_fp16_neon ""]]
-+}
-+
- # Return 1 if this is a ARM target with NEON enabled.
-
- proc check_effective_target_arm_neon { } {
-@@ -3526,6 +3879,25 @@ proc check_effective_target_arm_neonv2 { } {
- }
- }
-
-+# Return 1 if this is an ARM target with load acquire and store release
-+# instructions for 8-, 16- and 32-bit types.
-+
-+proc check_effective_target_arm_acq_rel { } {
-+ return [check_no_compiler_messages arm_acq_rel object {
-+ void
-+ load_acquire_store_release (void)
-+ {
-+ asm ("lda r0, [r1]\n\t"
-+ "stl r0, [r1]\n\t"
-+ "ldah r0, [r1]\n\t"
-+ "stlh r0, [r1]\n\t"
-+ "ldab r0, [r1]\n\t"
-+ "stlb r0, [r1]"
-+ : : : "r0", "memory");
-+ }
-+ }]
-+}
-+
- # Return 1 if this a Loongson-2E or -2F target using an ABI that supports
- # the Loongson vector modes.
-
-@@ -4380,6 +4752,8 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
- set et_vect_widen_sum_hi_to_si_pattern_saved 0
- if { [istarget powerpc*-*-*]
- || [istarget aarch64*-*-*]
-+ || ([istarget arm*-*-*] &&
-+ [check_effective_target_arm_neon_ok])
- || [istarget ia64-*-*] } {
- set et_vect_widen_sum_hi_to_si_pattern_saved 1
- }
-@@ -5755,6 +6129,8 @@ proc check_effective_target_sync_int_long { } {
- || [istarget aarch64*-*-*]
- || [istarget alpha*-*-*]
- || [istarget arm*-*-linux-*]
-+ || ([istarget arm*-*-*]
-+ && [check_effective_target_arm_acq_rel])
- || [istarget bfin*-*linux*]
- || [istarget hppa*-*linux*]
- || [istarget s390*-*-*]
-@@ -5788,6 +6164,8 @@ proc check_effective_target_sync_char_short { } {
- || [istarget i?86-*-*] || [istarget x86_64-*-*]
- || [istarget alpha*-*-*]
- || [istarget arm*-*-linux-*]
-+ || ([istarget arm*-*-*]
-+ && [check_effective_target_arm_acq_rel])
- || [istarget hppa*-*linux*]
- || [istarget s390*-*-*]
- || [istarget powerpc*-*-*]
---- a/src/gcc/tree-inline.c
-+++ b/src/gcc/tree-inline.c
-@@ -244,6 +244,7 @@ remap_ssa_name (tree name, copy_body_data *id)
- /* At least IPA points-to info can be directly transferred. */
- if (id->src_cfun->gimple_df
- && id->src_cfun->gimple_df->ipa_pta
-+ && POINTER_TYPE_P (TREE_TYPE (name))
- && (pi = SSA_NAME_PTR_INFO (name))
- && !pi->pt.anything)
- {
-@@ -276,6 +277,7 @@ remap_ssa_name (tree name, copy_body_data *id)
- /* At least IPA points-to info can be directly transferred. */
- if (id->src_cfun->gimple_df
- && id->src_cfun->gimple_df->ipa_pta
-+ && POINTER_TYPE_P (TREE_TYPE (name))
- && (pi = SSA_NAME_PTR_INFO (name))
- && !pi->pt.anything)
- {
---- a/src/gcc/tree-scalar-evolution.c
-+++ b/src/gcc/tree-scalar-evolution.c
-@@ -1937,6 +1937,36 @@ interpret_rhs_expr (struct loop *loop, gimple *at_stmt,
- res = chrec_convert (type, chrec1, at_stmt);
- break;
-
-+ case BIT_AND_EXPR:
-+ /* Given int variable A, handle A&0xffff as (int)(unsigned short)A.
-+     If A is a SCEV whose value fits in the range representable by
-+     type unsigned short, the result expression is a (no-overflow)
-+ SCEV. */
-+ res = chrec_dont_know;
-+ if (tree_fits_uhwi_p (rhs2))
-+ {
-+ int precision;
-+ unsigned HOST_WIDE_INT val = tree_to_uhwi (rhs2);
-+
-+ val ++;
-+ /* Skip if value of rhs2 wraps in unsigned HOST_WIDE_INT or
-+ it's not the maximum value of a smaller type than rhs1. */
-+ if (val != 0
-+ && (precision = exact_log2 (val)) > 0
-+ && (unsigned) precision < TYPE_PRECISION (TREE_TYPE (rhs1)))
-+ {
-+ tree utype = build_nonstandard_integer_type (precision, 1);
-+
-+ if (TYPE_PRECISION (utype) < TYPE_PRECISION (TREE_TYPE (rhs1)))
-+ {
-+ chrec1 = analyze_scalar_evolution (loop, rhs1);
-+ chrec1 = chrec_convert (utype, chrec1, at_stmt);
-+ res = chrec_convert (TREE_TYPE (rhs1), chrec1, at_stmt);
-+ }
-+ }
-+ }
-+ break;
-+
- default:
- res = chrec_dont_know;
- break;
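
(A hedged source-level illustration of the new BIT_AND_EXPR case; the function is hypothetical. The mask 0xffff is 2^16 - 1 and narrower than int, so the hunk analyzes "i & 0xffff" as (int)(unsigned short) i and the evolution remains a usable SCEV instead of chrec_dont_know:)

/* Hypothetical example: scalar evolution now sees the masked IV
   through a narrowing conversion rather than giving up.  */
int
sum_masked (int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += i & 0xffff;   /* treated as (int)(unsigned short) i */
  return s;
}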
---- a/src/gcc/tree-ssa-address.c
-+++ b/src/gcc/tree-ssa-address.c
-@@ -877,6 +877,10 @@ copy_ref_info (tree new_ref, tree old_ref)
- && TREE_CODE (old_ref) == MEM_REF
- && !(TREE_CODE (new_ref) == TARGET_MEM_REF
- && (TMR_INDEX2 (new_ref)
-+		   /* TODO: The conditions below can be relaxed if TMR_INDEX
-+		      is an induction variable and its initial value and
-+		      step are aligned.  */
-+ || (TMR_INDEX (new_ref) && !TMR_STEP (new_ref))
- || (TMR_STEP (new_ref)
- && (TREE_INT_CST_LOW (TMR_STEP (new_ref))
- < align)))))
---- a/src/gcc/tree-ssa-ccp.c
-+++ b/src/gcc/tree-ssa-ccp.c
-@@ -229,13 +229,12 @@ debug_lattice_value (ccp_prop_value_t val)
- fprintf (stderr, "\n");
- }
-
--/* Extend NONZERO_BITS to a full mask, with the upper bits being set. */
-+/* Extend NONZERO_BITS to a full mask, based on sgn. */
-
- static widest_int
--extend_mask (const wide_int &nonzero_bits)
-+extend_mask (const wide_int &nonzero_bits, signop sgn)
- {
-- return (wi::mask <widest_int> (wi::get_precision (nonzero_bits), true)
-- | widest_int::from (nonzero_bits, UNSIGNED));
-+ return widest_int::from (nonzero_bits, sgn);
- }
-
- /* Compute a default value for variable VAR and store it in the
-@@ -284,7 +283,7 @@ get_default_value (tree var)
- {
- val.lattice_val = CONSTANT;
- val.value = build_zero_cst (TREE_TYPE (var));
-- val.mask = extend_mask (nonzero_bits);
-+ val.mask = extend_mask (nonzero_bits, TYPE_SIGN (TREE_TYPE (var)));
- }
- }
- }
-@@ -1939,7 +1938,7 @@ evaluate_stmt (gimple *stmt)
- {
- val.lattice_val = CONSTANT;
- val.value = build_zero_cst (TREE_TYPE (lhs));
-- val.mask = extend_mask (nonzero_bits);
-+ val.mask = extend_mask (nonzero_bits, TYPE_SIGN (TREE_TYPE (lhs)));
- is_constant = true;
- }
- else
-@@ -1950,7 +1949,8 @@ evaluate_stmt (gimple *stmt)
- if (nonzero_bits == 0)
- val.mask = 0;
- else
-- val.mask = val.mask & extend_mask (nonzero_bits);
-+ val.mask = val.mask & extend_mask (nonzero_bits,
-+ TYPE_SIGN (TREE_TYPE (lhs)));
- }
- }
- }
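
(A numeric sketch of the behavioral change, as a stand-alone model rather than GCC code; the 8-bit-to-16-bit widths are an assumption for readability. The old extend_mask unconditionally marked every upper bit unknown; the sign-aware version extends according to the type's sign, so unsigned values gain known-zero upper bits:)

#include <stdio.h>
#include <stdint.h>

/* Model of the patched extend_mask: a set bit means "unknown".  */
static uint16_t
extend_mask_model (uint8_t nonzero_bits, int is_signed)
{
  if (is_signed && (nonzero_bits & 0x80))
    return 0xff00u | nonzero_bits;  /* sign-extend: upper bits unknown */
  return nonzero_bits;              /* upper bits known to be zero */
}

int
main (void)
{
  /* The old code would have produced 0xff0f in both cases.  */
  printf ("%#06x\n", (unsigned) extend_mask_model (0x0f, 0)); /* 0x000f */
  printf ("%#06x\n", (unsigned) extend_mask_model (0x8f, 1)); /* 0xff8f */
  return 0;
}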
---- a/src/gcc/tree-ssa-strlen.c
-+++ b/src/gcc/tree-ssa-strlen.c
-@@ -2263,7 +2263,7 @@ public:
- };
-
- /* Callback for walk_dominator_tree. Attempt to optimize various
-- string ops by remembering string lenths pointed by pointer SSA_NAMEs. */
-+ string ops by remembering string lengths pointed by pointer SSA_NAMEs. */
-
- edge
- strlen_dom_walker::before_dom_children (basic_block bb)
---- a/src/gcc/tree-vect-data-refs.c
-+++ b/src/gcc/tree-vect-data-refs.c
-@@ -2250,6 +2250,7 @@ vect_analyze_group_access_1 (struct data_reference *dr)
- {
- GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
- GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
-+ GROUP_GAP (stmt_info) = groupsize - 1;
- if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_NOTE, vect_location,
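
(A hedged illustration of the single-element interleaving group the added GROUP_GAP line describes; the function is hypothetical. Each iteration reads one element out of every four, giving GROUP_SIZE 4 and a gap of groupsize - 1 = 3 skipped elements:)

/* Hypothetical strided load: only the first element of each
   4-element group is used; the other three form the gap.  */
void
gather_every_fourth (int *restrict out, const int *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = in[4 * i];
}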
---- a/src/gcc/tree-vect-loop-manip.c
-+++ b/src/gcc/tree-vect-loop-manip.c
-@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see
- #include "cfgloop.h"
- #include "tree-scalar-evolution.h"
- #include "tree-vectorizer.h"
-+#include "tree-ssa-loop-ivopts.h"
-
- /*************************************************************************
- Simple Loop Peeling Utilities
-@@ -1594,10 +1595,26 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
- }
-
- /* FORNOW: We do not transform initial conditions of IVs
-+     whose evolution functions are not invariant in the loop.  */
-+
-+ if (!expr_invariant_in_loop_p (loop, evolution_part))
-+ {
-+ if (dump_enabled_p ())
-+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-+ "evolution not invariant in loop.\n");
-+ return false;
-+ }
-+
-+ /* FORNOW: We do not transform initial conditions of IVs
- which evolution functions are a polynomial of degree >= 2. */
-
- if (tree_is_chrec (evolution_part))
-- return false;
-+ {
-+ if (dump_enabled_p ())
-+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-+ "evolution is chrec.\n");
-+ return false;
-+ }
- }
-
- return true;
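
(A hedged example of an induction variable the strengthened checks reject; the function is hypothetical. The step of x is itself the IV i, so x's evolution part is the chrec {0, +, 1}, not a loop invariant, and the new dump messages explain the bail-out:)

/* Hypothetical second-order IV: x accumulates i, so its evolution
   is {0, +, {0, +, 1}} and vect_can_advance_ivs_p refuses it.  */
void
triangular (int *a, int n)
{
  int x = 0;
  for (int i = 0; i < n; i++)
    {
      a[i] = x;
      x += i;
    }
}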
---- a/src/gcc/tree-vect-patterns.c
-+++ b/src/gcc/tree-vect-patterns.c
-@@ -2136,32 +2136,313 @@ vect_recog_vector_vector_shift_pattern (vec<gimple *> *stmts,
- return pattern_stmt;
- }
-
--/* Detect multiplication by constant which are postive or negatives of power 2,
-- and convert them to shift patterns.
-+/* Return true iff the target has a vector optab implementing the operation
-+ CODE on type VECTYPE. */
-
-- Mult with constants that are postive power of two.
-- type a_t;
-- type b_t
-- S1: b_t = a_t * n
-+static bool
-+target_has_vecop_for_code (tree_code code, tree vectype)
-+{
-+ optab voptab = optab_for_tree_code (code, vectype, optab_vector);
-+ return voptab
-+ && optab_handler (voptab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
-+}
-
-- or
-+/* Verify that the target has optabs of VECTYPE to perform all the steps
-+ needed by the multiplication-by-immediate synthesis algorithm described by
-+ ALG and VAR. If SYNTH_SHIFT_P is true ensure that vector addition is
-+ present. Return true iff the target supports all the steps. */
-+
-+static bool
-+target_supports_mult_synth_alg (struct algorithm *alg, mult_variant var,
-+ tree vectype, bool synth_shift_p)
-+{
-+ if (alg->op[0] != alg_zero && alg->op[0] != alg_m)
-+ return false;
-+
-+ bool supports_vminus = target_has_vecop_for_code (MINUS_EXPR, vectype);
-+ bool supports_vplus = target_has_vecop_for_code (PLUS_EXPR, vectype);
-+
-+ if (var == negate_variant
-+ && !target_has_vecop_for_code (NEGATE_EXPR, vectype))
-+ return false;
-+
-+ /* If we must synthesize shifts with additions make sure that vector
-+ addition is available. */
-+ if ((var == add_variant || synth_shift_p) && !supports_vplus)
-+ return false;
-+
-+ for (int i = 1; i < alg->ops; i++)
-+ {
-+ switch (alg->op[i])
-+ {
-+ case alg_shift:
-+ break;
-+ case alg_add_t_m2:
-+ case alg_add_t2_m:
-+ case alg_add_factor:
-+ if (!supports_vplus)
-+ return false;
-+ break;
-+ case alg_sub_t_m2:
-+ case alg_sub_t2_m:
-+ case alg_sub_factor:
-+ if (!supports_vminus)
-+ return false;
-+ break;
-+ case alg_unknown:
-+ case alg_m:
-+ case alg_zero:
-+ case alg_impossible:
-+ return false;
-+ default:
-+ gcc_unreachable ();
-+ }
-+ }
-+
-+ return true;
-+}
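
(To ground the step kinds enumerated above, a scalar sketch of one decomposition choose_mult_variant may select; the constant 6 and the exact step sequence are illustrative assumptions:)

#include <assert.h>

/* Multiply by 6 as an alg_add_t2_m step followed by alg_shift:
   6*x == ((x << 1) + x) << 1.  */
static unsigned int
mult_by_6 (unsigned int x)
{
  unsigned int acc = (x << 1) + x;  /* alg_add_t2_m: 2x + x = 3x */
  return acc << 1;                  /* alg_shift:    3x * 2 = 6x */
}

int
main (void)
{
  assert (mult_by_6 (7) == 42);
  return 0;
}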
-+
-+/* Synthesize a left shift of OP by AMNT bits using a series of additions,
-+   putting the final result in DEST.  Append all statements but the last into
-+   VINFO.  Return the last statement.  */
-+
-+static gimple *
-+synth_lshift_by_additions (tree dest, tree op, HOST_WIDE_INT amnt,
-+ stmt_vec_info vinfo)
-+{
-+ HOST_WIDE_INT i;
-+ tree itype = TREE_TYPE (op);
-+ tree prev_res = op;
-+ gcc_assert (amnt >= 0);
-+ for (i = 0; i < amnt; i++)
-+ {
-+ tree tmp_var = (i < amnt - 1) ? vect_recog_temp_ssa_var (itype, NULL)
-+ : dest;
-+ gimple *stmt
-+ = gimple_build_assign (tmp_var, PLUS_EXPR, prev_res, prev_res);
-+ prev_res = tmp_var;
-+ if (i < amnt - 1)
-+ append_pattern_def_seq (vinfo, stmt);
-+ else
-+ return stmt;
-+ }
-+ gcc_unreachable ();
-+ return NULL;
-+}
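
(A scalar model of the sequence synth_lshift_by_additions emits, offered as a sketch rather than the generated GIMPLE: a left shift by AMNT becomes AMNT successive self-additions:)

/* Each generated PLUS_EXPR doubles the running value, so AMNT
   additions implement "op << AMNT".  */
static unsigned int
lshift_by_additions_model (unsigned int op, int amnt)
{
  while (amnt-- > 0)
    op = op + op;   /* one doubling per generated statement */
  return op;
}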
-+
-+/* Helper for vect_synth_mult_by_constant. Apply a binary operation
-+ CODE to operands OP1 and OP2, creating a new temporary SSA var in
-+ the process if necessary. Append the resulting assignment statements
-+ to the sequence in STMT_VINFO. Return the SSA variable that holds the
-+ result of the binary operation. If SYNTH_SHIFT_P is true synthesize
-+ left shifts using additions. */
-+
-+static tree
-+apply_binop_and_append_stmt (tree_code code, tree op1, tree op2,
-+ stmt_vec_info stmt_vinfo, bool synth_shift_p)
-+{
-+ if (integer_zerop (op2)
-+ && (code == LSHIFT_EXPR
-+ || code == PLUS_EXPR))
-+ {
-+ gcc_assert (TREE_CODE (op1) == SSA_NAME);
-+ return op1;
-+ }
-+
-+ gimple *stmt;
-+ tree itype = TREE_TYPE (op1);
-+ tree tmp_var = vect_recog_temp_ssa_var (itype, NULL);
-+
-+ if (code == LSHIFT_EXPR
-+ && synth_shift_p)
-+ {
-+ stmt = synth_lshift_by_additions (tmp_var, op1, TREE_INT_CST_LOW (op2),
-+ stmt_vinfo);
-+ append_pattern_def_seq (stmt_vinfo, stmt);
-+ return tmp_var;
-+ }
-+
-+ stmt = gimple_build_assign (tmp_var, code, op1, op2);
-+ append_pattern_def_seq (stmt_vinfo, stmt);
-+ return tmp_var;
-+}
-+
-+/* Synthesize a multiplication of OP by an INTEGER_CST VAL using shifts
-+ and simple arithmetic operations to be vectorized. Record the statements
-+ produced in STMT_VINFO and return the last statement in the sequence or
-+ NULL if it's not possible to synthesize such a multiplication.
-+ This function mirrors the behavior of expand_mult_const in expmed.c but
-+ works on tree-ssa form. */
-+
-+static gimple *
-+vect_synth_mult_by_constant (tree op, tree val,
-+ stmt_vec_info stmt_vinfo)
-+{
-+ tree itype = TREE_TYPE (op);
-+ machine_mode mode = TYPE_MODE (itype);
-+ struct algorithm alg;
-+ mult_variant variant;
-+ if (!tree_fits_shwi_p (val))
-+ return NULL;
-+
-+ /* Multiplication synthesis by shifts, adds and subs can introduce
-+ signed overflow where the original operation didn't. Perform the
-+ operations on an unsigned type and cast back to avoid this.
-+ In the future we may want to relax this for synthesis algorithms
-+ that we can prove do not cause unexpected overflow. */
-+ bool cast_to_unsigned_p = !TYPE_OVERFLOW_WRAPS (itype);
-+
-+ tree multtype = cast_to_unsigned_p ? unsigned_type_for (itype) : itype;
-+
-+ /* Targets that don't support vector shifts but support vector additions
-+ can synthesize shifts that way. */
-+ bool synth_shift_p = !vect_supportable_shift (LSHIFT_EXPR, multtype);
-+
-+ HOST_WIDE_INT hwval = tree_to_shwi (val);
-+ /* Use MAX_COST here as we don't want to limit the sequence on rtx costs.
-+ The vectorizer's benefit analysis will decide whether it's beneficial
-+ to do this. */
-+ bool possible = choose_mult_variant (mode, hwval, &alg,
-+ &variant, MAX_COST);
-+ if (!possible)
-+ return NULL;
-
-- Mult with constants that are negative power of two.
-- S2: b_t = a_t * -n
-+ tree vectype = get_vectype_for_scalar_type (multtype);
-+
-+ if (!vectype
-+ || !target_supports_mult_synth_alg (&alg, variant,
-+ vectype, synth_shift_p))
-+ return NULL;
-+
-+ tree accumulator;
-+
-+ /* Clear out the sequence of statements so we can populate it below. */
-+ STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) = NULL;
-+ gimple *stmt = NULL;
-+
-+ if (cast_to_unsigned_p)
-+ {
-+ tree tmp_op = vect_recog_temp_ssa_var (multtype, NULL);
-+ stmt = gimple_build_assign (tmp_op, CONVERT_EXPR, op);
-+ append_pattern_def_seq (stmt_vinfo, stmt);
-+ op = tmp_op;
-+ }
-+
-+ if (alg.op[0] == alg_zero)
-+ accumulator = build_int_cst (multtype, 0);
-+ else
-+ accumulator = op;
-+
-+ bool needs_fixup = (variant == negate_variant)
-+ || (variant == add_variant);
-+
-+ for (int i = 1; i < alg.ops; i++)
-+ {
-+ tree shft_log = build_int_cst (multtype, alg.log[i]);
-+ tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL);
-+ tree tmp_var = NULL_TREE;
-+
-+ switch (alg.op[i])
-+ {
-+ case alg_shift:
-+ if (synth_shift_p)
-+ stmt
-+ = synth_lshift_by_additions (accum_tmp, accumulator, alg.log[i],
-+ stmt_vinfo);
-+ else
-+ stmt = gimple_build_assign (accum_tmp, LSHIFT_EXPR, accumulator,
-+ shft_log);
-+ break;
-+ case alg_add_t_m2:
-+ tmp_var
-+ = apply_binop_and_append_stmt (LSHIFT_EXPR, op, shft_log,
-+ stmt_vinfo, synth_shift_p);
-+ stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator,
-+ tmp_var);
-+ break;
-+ case alg_sub_t_m2:
-+ tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, op,
-+ shft_log, stmt_vinfo,
-+ synth_shift_p);
-+ /* In some algorithms the first step involves zeroing the
-+ accumulator. If subtracting from such an accumulator
-+ just emit the negation directly. */
-+ if (integer_zerop (accumulator))
-+ stmt = gimple_build_assign (accum_tmp, NEGATE_EXPR, tmp_var);
-+ else
-+ stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, accumulator,
-+ tmp_var);
-+ break;
-+ case alg_add_t2_m:
-+ tmp_var
-+ = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log,
-+ stmt_vinfo, synth_shift_p);
-+ stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, tmp_var, op);
-+ break;
-+ case alg_sub_t2_m:
-+ tmp_var
-+ = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log,
-+ stmt_vinfo, synth_shift_p);
-+ stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, tmp_var, op);
-+ break;
-+ case alg_add_factor:
-+ tmp_var
-+ = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log,
-+ stmt_vinfo, synth_shift_p);
-+ stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator,
-+ tmp_var);
-+ break;
-+ case alg_sub_factor:
-+ tmp_var
-+ = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator, shft_log,
-+ stmt_vinfo, synth_shift_p);
-+ stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, tmp_var,
-+ accumulator);
-+ break;
-+ default:
-+ gcc_unreachable ();
-+ }
-+ /* We don't want to append the last stmt in the sequence to stmt_vinfo
-+ but rather return it directly. */
-+
-+ if ((i < alg.ops - 1) || needs_fixup || cast_to_unsigned_p)
-+ append_pattern_def_seq (stmt_vinfo, stmt);
-+ accumulator = accum_tmp;
-+ }
-+ if (variant == negate_variant)
-+ {
-+ tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL);
-+ stmt = gimple_build_assign (accum_tmp, NEGATE_EXPR, accumulator);
-+ accumulator = accum_tmp;
-+ if (cast_to_unsigned_p)
-+ append_pattern_def_seq (stmt_vinfo, stmt);
-+ }
-+ else if (variant == add_variant)
-+ {
-+ tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL);
-+ stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator, op);
-+ accumulator = accum_tmp;
-+ if (cast_to_unsigned_p)
-+ append_pattern_def_seq (stmt_vinfo, stmt);
-+ }
-+ /* Move back to a signed type if needed. */
-+ if (cast_to_unsigned_p)
-+ {
-+ tree accum_tmp = vect_recog_temp_ssa_var (itype, NULL);
-+ stmt = gimple_build_assign (accum_tmp, CONVERT_EXPR, accumulator);
-+ }
-+
-+ return stmt;
-+}
-+
-+/* Detect multiplication by a constant and convert it into a sequence of
-+ shifts, additions, subtractions and negations. We reuse the
-+ choose_mult_variant algorithms from expmed.c.
-
- Input/Output:
-
- STMTS: Contains a stmt from which the pattern search begins,
-- i.e. the mult stmt. Convert the mult operation to LSHIFT if
-- constant operand is a power of 2.
-- type a_t, b_t
-- S1': b_t = a_t << log2 (n)
--
-- Convert the mult operation to LSHIFT and followed by a NEGATE
-- if constant operand is a negative power of 2.
-- type a_t, b_t, res_T;
-- S2': b_t = a_t << log2 (n)
-- S3': res_T = - (b_t)
-+ i.e. the mult stmt.
-
- Output:
-
-@@ -2169,8 +2450,8 @@ vect_recog_vector_vector_shift_pattern (vec<gimple *> *stmts,
-
- * TYPE_OUT: The type of the output of this pattern.
-
-- * Return value: A new stmt that will be used to replace the multiplication
-- S1 or S2 stmt. */
-+ * Return value: A new stmt that will be used to replace
-+ the multiplication. */
-
- static gimple *
- vect_recog_mult_pattern (vec<gimple *> *stmts,
-@@ -2178,11 +2459,8 @@ vect_recog_mult_pattern (vec<gimple *> *stmts,
- {
- gimple *last_stmt = stmts->pop ();
- tree oprnd0, oprnd1, vectype, itype;
-- gimple *pattern_stmt, *def_stmt;
-- optab optab;
-+ gimple *pattern_stmt;
- stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
-- int power2_val, power2_neg_val;
-- tree shift;
-
- if (!is_gimple_assign (last_stmt))
- return NULL;
-@@ -2206,52 +2484,17 @@ vect_recog_mult_pattern (vec<gimple *> *stmts,
-
- /* If the target can handle vectorized multiplication natively,
- don't attempt to optimize this. */
-- optab = optab_for_tree_code (MULT_EXPR, vectype, optab_default);
-- if (optab != unknown_optab)
-+ optab mul_optab = optab_for_tree_code (MULT_EXPR, vectype, optab_default);
-+ if (mul_optab != unknown_optab)
- {
- machine_mode vec_mode = TYPE_MODE (vectype);
-- int icode = (int) optab_handler (optab, vec_mode);
-+ int icode = (int) optab_handler (mul_optab, vec_mode);
- if (icode != CODE_FOR_nothing)
-- return NULL;
-+ return NULL;
- }
-
-- /* If target cannot handle vector left shift then we cannot
-- optimize and bail out. */
-- optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
-- if (!optab
-- || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
-- return NULL;
--
-- power2_val = wi::exact_log2 (oprnd1);
-- power2_neg_val = wi::exact_log2 (wi::neg (oprnd1));
--
-- /* Handle constant operands that are positive or negative powers of 2. */
-- if (power2_val != -1)
-- {
-- shift = build_int_cst (itype, power2_val);
-- pattern_stmt
-- = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
-- LSHIFT_EXPR, oprnd0, shift);
-- }
-- else if (power2_neg_val != -1)
-- {
-- /* If the target cannot handle vector NEGATE then we cannot
-- do the optimization. */
-- optab = optab_for_tree_code (NEGATE_EXPR, vectype, optab_vector);
-- if (!optab
-- || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
-- return NULL;
--
-- shift = build_int_cst (itype, power2_neg_val);
-- def_stmt
-- = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
-- LSHIFT_EXPR, oprnd0, shift);
-- new_pattern_def_seq (stmt_vinfo, def_stmt);
-- pattern_stmt
-- = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
-- NEGATE_EXPR, gimple_assign_lhs (def_stmt));
-- }
-- else
-+ pattern_stmt = vect_synth_mult_by_constant (oprnd0, oprnd1, stmt_vinfo);
-+ if (!pattern_stmt)
- return NULL;
-
- /* Pattern detected. */
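The hunk above replaces the old power-of-two-only shortcut with full constant-multiplication synthesis. A minimal C-level sketch of the kind of sequence vect_synth_mult_by_constant emits (the constant 10 and the helper name are illustrative only, not taken from the patch):

    /* x * 10 decomposed into the alg_shift / alg_add_t_m2 steps that
       choose_mult_variant typically selects for this constant.  */
    int
    mul10 (int x)
    {
      int t1 = x << 2;   /* x * 4  */
      int t2 = t1 + x;   /* x * 5  */
      return t2 << 1;    /* x * 10 */
    }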
---- a/src/gcc/tree-vect-stmts.c
-+++ b/src/gcc/tree-vect-stmts.c
-@@ -6354,12 +6354,22 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
- gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
-
- first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-+ group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-+
-+ if (!slp
-+ && !PURE_SLP_STMT (stmt_info)
-+ && !STMT_VINFO_STRIDED_P (stmt_info))
-+ {
-+ if (vect_load_lanes_supported (vectype, group_size))
-+ load_lanes_p = true;
-+ else if (!vect_grouped_load_supported (vectype, group_size))
-+ return false;
-+ }
-
- /* If this is single-element interleaving with an element distance
- that leaves unused vector loads around punt - we at least create
- very sub-optimal code in that case (and blow up memory,
- see PR65518). */
-- bool force_peeling = false;
- if (first_stmt == stmt
- && !GROUP_NEXT_ELEMENT (stmt_info))
- {
-@@ -6373,7 +6383,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
- }
-
- /* Single-element interleaving requires peeling for gaps. */
-- force_peeling = true;
-+ gcc_assert (GROUP_GAP (stmt_info));
- }
-
- /* If there is a gap in the end of the group or the group size cannot
-@@ -6381,9 +6391,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
- elements in the last iteration and thus need to peel that off. */
- if (loop_vinfo
- && ! STMT_VINFO_STRIDED_P (stmt_info)
-- && (force_peeling
-- || GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-- || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
-+ && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-+ || (!slp && !load_lanes_p && vf % group_size != 0)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-@@ -6403,8 +6412,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
- if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
- slp_perm = true;
-
-- group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
--
- /* ??? The following is overly pessimistic (as well as the loop
- case above) in the case we can statically determine the excess
- elements loaded are within the bounds of a decl that is accessed.
-@@ -6417,16 +6424,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
- return false;
- }
-
-- if (!slp
-- && !PURE_SLP_STMT (stmt_info)
-- && !STMT_VINFO_STRIDED_P (stmt_info))
-- {
-- if (vect_load_lanes_supported (vectype, group_size))
-- load_lanes_p = true;
-- else if (!vect_grouped_load_supported (vectype, group_size))
-- return false;
-- }
--
- /* Invalidate assumptions made by dependence analysis when vectorization
- on the unrolled body effectively re-orders stmts. */
- if (!PURE_SLP_STMT (stmt_info)
---- a/src/gcc/tree-vectorizer.c
-+++ b/src/gcc/tree-vectorizer.c
-@@ -794,38 +794,142 @@ make_pass_slp_vectorize (gcc::context *ctxt)
- This should involve global alignment analysis and in the future also
- array padding. */
-
-+static unsigned get_vec_alignment_for_type (tree);
-+static hash_map<tree, unsigned> *type_align_map;
-+
-+/* Return alignment of array's vector type corresponding to scalar type.
-+ Return 0 if no vector type exists. */
-+static unsigned
-+get_vec_alignment_for_array_type (tree type)
-+{
-+ gcc_assert (TREE_CODE (type) == ARRAY_TYPE);
-+
-+ tree vectype = get_vectype_for_scalar_type (strip_array_types (type));
-+ if (!vectype
-+ || !TYPE_SIZE (type)
-+ || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST
-+ || tree_int_cst_lt (TYPE_SIZE (type), TYPE_SIZE (vectype)))
-+ return 0;
-+
-+ return TYPE_ALIGN (vectype);
-+}
-+
-+/* Return alignment of the field having maximum alignment of vector type
-+ corresponding to its scalar type. For now, we only consider fields whose
-+ offset is a multiple of its vector alignment.
-+ Return 0 if no suitable field is found. */
-+static unsigned
-+get_vec_alignment_for_record_type (tree type)
-+{
-+ gcc_assert (TREE_CODE (type) == RECORD_TYPE);
-+
-+ unsigned max_align = 0, alignment;
-+ HOST_WIDE_INT offset;
-+ tree offset_tree;
-+
-+ if (TYPE_PACKED (type))
-+ return 0;
-+
-+ unsigned *slot = type_align_map->get (type);
-+ if (slot)
-+ return *slot;
-+
-+ for (tree field = first_field (type);
-+ field != NULL_TREE;
-+ field = DECL_CHAIN (field))
-+ {
-+ /* Skip if not FIELD_DECL or if alignment is set by user. */
-+ if (TREE_CODE (field) != FIELD_DECL
-+ || DECL_USER_ALIGN (field)
-+ || DECL_ARTIFICIAL (field))
-+ continue;
-+
-+ /* We don't need to process the type further if offset is variable,
-+ since the offsets of remaining members will also be variable. */
-+ if (TREE_CODE (DECL_FIELD_OFFSET (field)) != INTEGER_CST
-+ || TREE_CODE (DECL_FIELD_BIT_OFFSET (field)) != INTEGER_CST)
-+ break;
-+
-+ /* Similarly stop processing the type if offset_tree
-+ does not fit in unsigned HOST_WIDE_INT. */
-+ offset_tree = bit_position (field);
-+ if (!tree_fits_uhwi_p (offset_tree))
-+ break;
-+
-+ offset = tree_to_uhwi (offset_tree);
-+ alignment = get_vec_alignment_for_type (TREE_TYPE (field));
-+
-+ /* Get maximum alignment of vectorized field/array among those members
-+ whose offset is multiple of the vector alignment. */
-+ if (alignment
-+ && (offset % alignment == 0)
-+ && (alignment > max_align))
-+ max_align = alignment;
-+ }
-+
-+ type_align_map->put (type, max_align);
-+ return max_align;
-+}
-+
-+/* Return alignment of vector type corresponding to decl's scalar type
-+ or 0 if it doesn't exist or the vector alignment is less than
-+ decl's alignment. */
-+static unsigned
-+get_vec_alignment_for_type (tree type)
-+{
-+ if (type == NULL_TREE)
-+ return 0;
-+
-+ gcc_assert (TYPE_P (type));
-+
-+ static unsigned alignment = 0;
-+ switch (TREE_CODE (type))
-+ {
-+ case ARRAY_TYPE:
-+ alignment = get_vec_alignment_for_array_type (type);
-+ break;
-+ case RECORD_TYPE:
-+ alignment = get_vec_alignment_for_record_type (type);
-+ break;
-+ default:
-+ alignment = 0;
-+ break;
-+ }
-+
-+ return (alignment > TYPE_ALIGN (type)) ? alignment : 0;
-+}
-+
-+/* Entry point to increase_alignment pass. */
- static unsigned int
- increase_alignment (void)
- {
- varpool_node *vnode;
-
- vect_location = UNKNOWN_LOCATION;
-+ type_align_map = new hash_map<tree, unsigned>;
-
- /* Increase the alignment of all global arrays for vectorization. */
- FOR_EACH_DEFINED_VARIABLE (vnode)
- {
-- tree vectype, decl = vnode->decl;
-- tree t;
-+ tree decl = vnode->decl;
- unsigned int alignment;
-
-- t = TREE_TYPE (decl);
-- if (TREE_CODE (t) != ARRAY_TYPE)
-- continue;
-- vectype = get_vectype_for_scalar_type (strip_array_types (t));
-- if (!vectype)
-- continue;
-- alignment = TYPE_ALIGN (vectype);
-- if (DECL_ALIGN (decl) >= alignment)
-- continue;
--
-- if (vect_can_force_dr_alignment_p (decl, alignment))
-+ if ((decl_in_symtab_p (decl)
-+ && !symtab_node::get (decl)->can_increase_alignment_p ())
-+ || DECL_USER_ALIGN (decl) || DECL_ARTIFICIAL (decl))
-+ continue;
-+
-+ alignment = get_vec_alignment_for_type (TREE_TYPE (decl));
-+ if (alignment && vect_can_force_dr_alignment_p (decl, alignment))
- {
-- vnode->increase_alignment (TYPE_ALIGN (vectype));
-+ vnode->increase_alignment (alignment);
- dump_printf (MSG_NOTE, "Increasing alignment of decl: ");
- dump_generic_expr (MSG_NOTE, TDF_SLIM, decl);
- dump_printf (MSG_NOTE, "\n");
- }
- }
-+
-+ delete type_align_map;
- return 0;
- }
-
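The reworked increase_alignment pass above now derives a target alignment for arrays nested inside records as well as for plain global arrays. A hedged source-level sketch of its effect (the 16-byte figure assumes a target with 128-bit vectors; the attribute only mimics what vnode->increase_alignment does internally):

    int data[1024];   /* before: element alignment, e.g. 4 bytes */
    /* after the pass, conceptually: */
    int data_aligned[1024] __attribute__ ((aligned (16)));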
---- a/src/gcc/tree-vrp.c
-+++ b/src/gcc/tree-vrp.c
-@@ -3165,6 +3165,24 @@ extract_range_from_binary_expr_1 (value_range *vr,
- if (int_cst_range1 && tree_int_cst_sgn (vr1.min) >= 0)
- wmax = wi::min (wmax, vr1.max, TYPE_SIGN (expr_type));
- max = wide_int_to_tree (expr_type, wmax);
-+ cmp = compare_values (min, max);
-+ /* PR68217: In case of signed & sign-bit-CST should
-+ result in [-INF, 0] instead of [-INF, INF]. */
-+ if (cmp == -2 || cmp == 1)
-+ {
-+ wide_int sign_bit
-+ = wi::set_bit_in_zero (TYPE_PRECISION (expr_type) - 1,
-+ TYPE_PRECISION (expr_type));
-+ if (!TYPE_UNSIGNED (expr_type)
-+ && ((value_range_constant_singleton (&vr0)
-+ && !wi::cmps (vr0.min, sign_bit))
-+ || (value_range_constant_singleton (&vr1)
-+ && !wi::cmps (vr1.min, sign_bit))))
-+ {
-+ min = TYPE_MIN_VALUE (expr_type);
-+ max = build_int_cst (expr_type, 0);
-+ }
-+ }
- }
- else if (code == BIT_IOR_EXPR)
- {
-@@ -3859,7 +3877,8 @@ extract_range_basic (value_range *vr, gimple *stmt)
- arg = gimple_call_arg (stmt, 0);
- if (TREE_CODE (arg) == SSA_NAME
- && SSA_NAME_IS_DEFAULT_DEF (arg)
-- && TREE_CODE (SSA_NAME_VAR (arg)) == PARM_DECL)
-+ && TREE_CODE (SSA_NAME_VAR (arg)) == PARM_DECL
-+ && cfun->after_inlining)
- {
- set_value_range_to_null (vr, type);
- return;
-@@ -9935,6 +9954,40 @@ simplify_internal_call_using_ranges (gimple_stmt_iterator *gsi, gimple *stmt)
- return true;
- }
-
-+/* Return true if VAR is a two-valued variable. Set *A and *B to the
-+ two values when it is true. Return false otherwise. */
-+
-+static bool
-+two_valued_val_range_p (tree var, tree *a, tree *b)
-+{
-+ value_range *vr = get_value_range (var);
-+ if ((vr->type != VR_RANGE
-+ && vr->type != VR_ANTI_RANGE)
-+ || TREE_CODE (vr->min) != INTEGER_CST
-+ || TREE_CODE (vr->max) != INTEGER_CST)
-+ return false;
-+
-+ if (vr->type == VR_RANGE
-+ && wi::sub (vr->max, vr->min) == 1)
-+ {
-+ *a = vr->min;
-+ *b = vr->max;
-+ return true;
-+ }
-+
-+ /* ~[TYPE_MIN + 1, TYPE_MAX - 1] */
-+ if (vr->type == VR_ANTI_RANGE
-+ && wi::sub (vr->min, vrp_val_min (TREE_TYPE (var))) == 1
-+ && wi::sub (vrp_val_max (TREE_TYPE (var)), vr->max) == 1)
-+ {
-+ *a = vrp_val_min (TREE_TYPE (var));
-+ *b = vrp_val_max (TREE_TYPE (var));
-+ return true;
-+ }
-+
-+ return false;
-+}
-+
- /* Simplify STMT using ranges if possible. */
-
- static bool
-@@ -9945,6 +9998,68 @@ simplify_stmt_using_ranges (gimple_stmt_iterator *gsi)
- {
- enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
- tree rhs1 = gimple_assign_rhs1 (stmt);
-+ tree rhs2 = gimple_assign_rhs2 (stmt);
-+ tree lhs = gimple_assign_lhs (stmt);
-+ tree val1 = NULL_TREE, val2 = NULL_TREE;
-+ use_operand_p use_p;
-+ gimple *use_stmt;
-+
-+ /* Convert:
-+ LHS = CST BINOP VAR
-+ Where VAR is two-valued and LHS is used in GIMPLE_COND only
-+ To:
-+ LHS = VAR == VAL1 ? (CST BINOP VAL1) : (CST BINOP VAL2)
-+
-+ Also handles:
-+ LHS = VAR BINOP CST
-+ Where VAR is two-valued and LHS is used in GIMPLE_COND only
-+ To:
-+ LHS = VAR == VAL1 ? (VAL1 BINOP CST) : (VAL2 BINOP CST) */
-+
-+ if (TREE_CODE_CLASS (rhs_code) == tcc_binary
-+ && INTEGRAL_TYPE_P (TREE_TYPE (lhs))
-+ && ((TREE_CODE (rhs1) == INTEGER_CST
-+ && TREE_CODE (rhs2) == SSA_NAME)
-+ || (TREE_CODE (rhs2) == INTEGER_CST
-+ && TREE_CODE (rhs1) == SSA_NAME))
-+ && single_imm_use (lhs, &use_p, &use_stmt)
-+ && gimple_code (use_stmt) == GIMPLE_COND)
-+
-+ {
-+ tree new_rhs1 = NULL_TREE;
-+ tree new_rhs2 = NULL_TREE;
-+ tree cmp_var = NULL_TREE;
-+
-+ if (TREE_CODE (rhs2) == SSA_NAME
-+ && two_valued_val_range_p (rhs2, &val1, &val2))
-+ {
-+ /* Optimize RHS1 OP [VAL1, VAL2]. */
-+ new_rhs1 = int_const_binop (rhs_code, rhs1, val1);
-+ new_rhs2 = int_const_binop (rhs_code, rhs1, val2);
-+ cmp_var = rhs2;
-+ }
-+ else if (TREE_CODE (rhs1) == SSA_NAME
-+ && two_valued_val_range_p (rhs1, &val1, &val2))
-+ {
-+ /* Optimize [VAL1, VAL2] OP RHS2. */
-+ new_rhs1 = int_const_binop (rhs_code, val1, rhs2);
-+ new_rhs2 = int_const_binop (rhs_code, val2, rhs2);
-+ cmp_var = rhs1;
-+ }
-+
-+ /* If we could not find the two values, or the optimization is invalid
-+ as in a divide by zero, new_rhs1 / new_rhs2 will be NULL_TREE. */
-+ if (new_rhs1 && new_rhs2)
-+ {
-+ tree cond = build2 (EQ_EXPR, TREE_TYPE (cmp_var), cmp_var, val1);
-+ gimple_assign_set_rhs_with_ops (gsi,
-+ COND_EXPR, cond,
-+ new_rhs1,
-+ new_rhs2);
-+ update_stmt (gsi_stmt (*gsi));
-+ return true;
-+ }
-+ }
-
- switch (rhs_code)
- {
---- a/src/gcc/tree.h
-+++ b/src/gcc/tree.h
-@@ -4628,69 +4628,6 @@ extern void warn_deprecated_use (tree, tree);
- extern void cache_integer_cst (tree);
- extern const char *combined_fn_name (combined_fn);
-
--/* Return the memory model from a host integer. */
--static inline enum memmodel
--memmodel_from_int (unsigned HOST_WIDE_INT val)
--{
-- return (enum memmodel) (val & MEMMODEL_MASK);
--}
--
--/* Return the base memory model from a host integer. */
--static inline enum memmodel
--memmodel_base (unsigned HOST_WIDE_INT val)
--{
-- return (enum memmodel) (val & MEMMODEL_BASE_MASK);
--}
--
--/* Return TRUE if the memory model is RELAXED. */
--static inline bool
--is_mm_relaxed (enum memmodel model)
--{
-- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELAXED;
--}
--
--/* Return TRUE if the memory model is CONSUME. */
--static inline bool
--is_mm_consume (enum memmodel model)
--{
-- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_CONSUME;
--}
--
--/* Return TRUE if the memory model is ACQUIRE. */
--static inline bool
--is_mm_acquire (enum memmodel model)
--{
-- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQUIRE;
--}
--
--/* Return TRUE if the memory model is RELEASE. */
--static inline bool
--is_mm_release (enum memmodel model)
--{
-- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_RELEASE;
--}
--
--/* Return TRUE if the memory model is ACQ_REL. */
--static inline bool
--is_mm_acq_rel (enum memmodel model)
--{
-- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_ACQ_REL;
--}
--
--/* Return TRUE if the memory model is SEQ_CST. */
--static inline bool
--is_mm_seq_cst (enum memmodel model)
--{
-- return (model & MEMMODEL_BASE_MASK) == MEMMODEL_SEQ_CST;
--}
--
--/* Return TRUE if the memory model is a SYNC variant. */
--static inline bool
--is_mm_sync (enum memmodel model)
--{
-- return (model & MEMMODEL_SYNC);
--}
--
- /* Compare and hash for any structure which begins with a canonical
- pointer. Assumes all pointers are interchangeable, which is sort
- of already assumed by gcc elsewhere IIRC. */
---- a/src/gcc/tsan.c
-+++ b/src/gcc/tsan.c
-@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3. If not see
- #include "backend.h"
- #include "rtl.h"
- #include "tree.h"
-+#include "memmodel.h"
- #include "gimple.h"
- #include "tree-pass.h"
- #include "ssa.h"
---- a/src/gcc/varasm.c
-+++ b/src/gcc/varasm.c
-@@ -6776,6 +6776,16 @@ default_use_anchors_for_symbol_p (const_rtx symbol)
- sections that should be marked as small in the section directive. */
- if (targetm.in_small_data_p (decl))
- return false;
-+
-+ /* Don't use section anchors for decls that won't fit inside a single
-+ anchor range to reduce the amount of instructions required to refer
-+ to the entire declaration. */
-+ if (DECL_SIZE_UNIT (decl) == NULL_TREE
-+ || !tree_fits_uhwi_p (DECL_SIZE_UNIT (decl))
-+ || (tree_to_uhwi (DECL_SIZE_UNIT (decl))
-+ >= (unsigned HOST_WIDE_INT) targetm.max_anchor_offset))
-+ return false;
-+
- }
- return true;
- }
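The varasm.c change stops using section anchors for declarations too large to sit inside one anchor range. Roughly, under an assumed small max_anchor_offset (sizes invented for illustration):

    int a, b;            /* small: both reachable as anchor + offset      */
    char big[1 << 20];   /* exceeds targetm.max_anchor_offset: now gets
                            its own address instead of anchor arithmetic  */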
---- a/src/libcpp/expr.c
-+++ b/src/libcpp/expr.c
-@@ -1073,7 +1073,7 @@ eval_token (cpp_reader *pfile, const cpp_token *token,
- result.low = 0;
- if (CPP_OPTION (pfile, warn_undef) && !pfile->state.skip_eval)
- cpp_warning_with_line (pfile, CPP_W_UNDEF, virtual_location, 0,
-- "\"%s\" is not defined",
-+ "\"%s\" is not defined, evaluates to 0",
- NODE_NAME (token->val.node.node));
- }
- break;
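The libcpp hunk only extends the -Wundef diagnostic wording. For reference, a construct that triggers it (macro name invented):

    #if ENABLE_FOO   /* never defined: with -Wundef the warning now reads
                        '"ENABLE_FOO" is not defined, evaluates to 0' */
    #endif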
---- a/src/libcpp/lex.c
-+++ b/src/libcpp/lex.c
-@@ -750,6 +750,101 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
- }
- }
-
-+#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
-+#include "arm_neon.h"
-+
-+/* This doesn't have to be the exact page size, but no system may use
-+ a size smaller than this. ARMv8 requires a minimum page size of
-+ 4k. The impact of being conservative here is that a small number
-+ of cases will take the slightly slower entry path into the main
-+ loop. */
-+
-+#define AARCH64_MIN_PAGE_SIZE 4096
-+
-+static const uchar *
-+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
-+{
-+ const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
-+ const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
-+ const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
-+ const uint8x16_t repl_qm = vdupq_n_u8 ('?');
-+ const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
-+
-+#ifdef __AARCH64EB
-+ const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
-+#else
-+ const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
-+#endif
-+
-+ unsigned int found;
-+ const uint8_t *p;
-+ uint8x16_t data;
-+ uint8x16_t t;
-+ uint16x8_t m;
-+ uint8x16_t u, v, w;
-+
-+ /* Align the source pointer. */
-+ p = (const uint8_t *)((uintptr_t)s & -16);
-+
-+ /* Assuming random string start positions, with a 4k page size we'll take
-+ the slow path about 0.37% of the time. */
-+ if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
-+ - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
-+ < 16, 0))
-+ {
-+ /* Slow path: the string starts near a possible page boundary. */
-+ uint32_t misalign, mask;
-+
-+ misalign = (uintptr_t)s & 15;
-+ mask = (-1u << misalign) & 0xffff;
-+ data = vld1q_u8 (p);
-+ t = vceqq_u8 (data, repl_nl);
-+ u = vceqq_u8 (data, repl_cr);
-+ v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
-+ w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
-+ t = vorrq_u8 (v, w);
-+ t = vandq_u8 (t, xmask);
-+ m = vpaddlq_u8 (t);
-+ m = vshlq_u16 (m, shift);
-+ found = vaddvq_u16 (m);
-+ found &= mask;
-+ if (found)
-+ return (const uchar*)p + __builtin_ctz (found);
-+ }
-+ else
-+ {
-+ data = vld1q_u8 ((const uint8_t *) s);
-+ t = vceqq_u8 (data, repl_nl);
-+ u = vceqq_u8 (data, repl_cr);
-+ v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
-+ w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
-+ t = vorrq_u8 (v, w);
-+ if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
-+ goto done;
-+ }
-+
-+ do
-+ {
-+ p += 16;
-+ data = vld1q_u8 (p);
-+ t = vceqq_u8 (data, repl_nl);
-+ u = vceqq_u8 (data, repl_cr);
-+ v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
-+ w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
-+ t = vorrq_u8 (v, w);
-+ } while (!vpaddd_u64 ((uint64x2_t)t));
-+
-+done:
-+ /* Now that we've found the terminating substring, work out precisely where
-+ we need to stop. */
-+ t = vandq_u8 (t, xmask);
-+ m = vpaddlq_u8 (t);
-+ m = vshlq_u16 (m, shift);
-+ found = vaddvq_u16 (m);
-+ return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
-+ + __builtin_ctz (found));
-+}
-+
- #elif defined (__ARM_NEON)
- #include "arm_neon.h"
-
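For readers unfamiliar with this scanner: the AArch64 NEON routine above is a 16-bytes-per-iteration version of the following scalar loop (a sketch for exposition, not the actual generic fallback in lex.c):

    /* Return the first byte that can end or continue a logical line:
       newline, carriage return, backslash, or '?' (trigraphs).  */
    static const unsigned char *
    search_line_scalar (const unsigned char *s)
    {
      for (;; s++)
        if (*s == '\n' || *s == '\r' || *s == '\\' || *s == '?')
          return s;
    }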
---- a/src/libgcc/Makefile.in
-+++ b/src/libgcc/Makefile.in
-@@ -414,8 +414,9 @@ lib2funcs = _muldi3 _negdi2 _lshrdi3 _ashldi3 _ashrdi3 _cmpdi2 _ucmpdi2 \
- _negvsi2 _negvdi2 _ctors _ffssi2 _ffsdi2 _clz _clzsi2 _clzdi2 \
- _ctzsi2 _ctzdi2 _popcount_tab _popcountsi2 _popcountdi2 \
- _paritysi2 _paritydi2 _powisf2 _powidf2 _powixf2 _powitf2 \
-- _mulsc3 _muldc3 _mulxc3 _multc3 _divsc3 _divdc3 _divxc3 \
-- _divtc3 _bswapsi2 _bswapdi2 _clrsbsi2 _clrsbdi2
-+ _mulhc3 _mulsc3 _muldc3 _mulxc3 _multc3 _divhc3 _divsc3 \
-+ _divdc3 _divxc3 _divtc3 _bswapsi2 _bswapdi2 _clrsbsi2 \
-+ _clrsbdi2
-
- # The floating-point conversion routines that involve a single-word integer.
- # XX stands for the integer mode.
---- a/src/libgcc/config.host
-+++ b/src/libgcc/config.host
-@@ -1399,4 +1399,8 @@ i[34567]86-*-linux* | x86_64-*-linux*)
- fi
- tm_file="${tm_file} i386/value-unwind.h"
- ;;
-+aarch64*-*-*)
-+ # ILP32 needs an extra header for unwinding
-+ tm_file="${tm_file} aarch64/value-unwind.h"
-+ ;;
- esac
---- /dev/null
-+++ b/src/libgcc/config/aarch64/value-unwind.h
-@@ -0,0 +1,25 @@
-+/* Store register values as _Unwind_Word type in DWARF2 EH unwind context.
-+ Copyright (C) 2017 Free Software Foundation, Inc.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published
-+ by the Free Software Foundation; either version 3, or (at your
-+ option) any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but WITHOUT
-+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-+ License for more details.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+/* Define this macro if the target stores register values as _Unwind_Word
-+ type in unwind context. Only enable it for ilp32. */
-+#if defined __aarch64__ && !defined __LP64__
-+# define REG_VALUE_IN_UNWIND_CONTEXT
-+#endif
---- a/src/libgcc/config/arm/bpabi-v6m.S
-+++ b/src/libgcc/config/arm/bpabi-v6m.S
-@@ -1,4 +1,5 @@
--/* Miscellaneous BPABI functions. ARMv6M implementation
-+/* Miscellaneous BPABI functions. Thumb-1 implementation, suitable for ARMv4T,
-+ ARMv6-M and ARMv8-M Baseline like ISA variants.
-
- Copyright (C) 2006-2016 Free Software Foundation, Inc.
- Contributed by CodeSourcery.
---- /dev/null
-+++ b/src/libgcc/config/arm/cmse.c
-@@ -0,0 +1,108 @@
-+/* ARMv8-M Security Extensions routines.
-+ Copyright (C) 2015-2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by the
-+ Free Software Foundation; either version 3, or (at your option) any
-+ later version.
-+
-+ This file is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+
-+#if __ARM_FEATURE_CMSE & 1
-+
-+#include <arm_cmse.h>
-+
-+/* ARM intrinsic function to perform a permission check on a given
-+ address range. See ACLE changes for ARMv8-M. */
-+
-+void *
-+cmse_check_address_range (void *p, size_t size, int flags)
-+{
-+ cmse_address_info_t permb, perme;
-+ char *pb = (char *) p, *pe;
-+
-+ /* Check if the range wraps around. */
-+ if (UINTPTR_MAX - (uintptr_t) p < size)
-+ return NULL;
-+
-+ /* Check if an unknown flag is present. */
-+ int known = CMSE_MPU_UNPRIV | CMSE_MPU_READWRITE | CMSE_MPU_READ;
-+ int known_secure_level = CMSE_MPU_UNPRIV;
-+#if __ARM_FEATURE_CMSE & 2
-+ known |= CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE;
-+ known_secure_level |= CMSE_MPU_NONSECURE;
-+#endif
-+ if (flags & (~known))
-+ return NULL;
-+
-+ /* Execute the right variant of the TT instructions. */
-+ pe = pb + size - 1;
-+ const int singleCheck = (((uintptr_t) pb ^ (uintptr_t) pe) < 32);
-+ switch (flags & known_secure_level)
-+ {
-+ case 0:
-+ permb = cmse_TT (pb);
-+ perme = singleCheck ? permb : cmse_TT (pe);
-+ break;
-+ case CMSE_MPU_UNPRIV:
-+ permb = cmse_TTT (pb);
-+ perme = singleCheck ? permb : cmse_TTT (pe);
-+ break;
-+#if __ARM_FEATURE_CMSE & 2
-+ case CMSE_MPU_NONSECURE:
-+ permb = cmse_TTA (pb);
-+ perme = singleCheck ? permb : cmse_TTA (pe);
-+ break;
-+ case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
-+ permb = cmse_TTAT (pb);
-+ perme = singleCheck ? permb : cmse_TTAT (pe);
-+ break;
-+#endif
-+ default:
-+ /* Invalid flag, e.g. CMSE_MPU_NONSECURE specified but
-+ __ARM_FEATURE_CMSE & 2 == 0. */
-+ return NULL;
-+ }
-+
-+ /* Check that the range does not cross MPU, SAU, or IDAU boundaries. */
-+ if (permb.value != perme.value)
-+ return NULL;
-+
-+ /* Check the permissions on the range. */
-+ switch (flags & (~known_secure_level))
-+ {
-+#if __ARM_FEATURE_CMSE & 2
-+ case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
-+ case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
-+ return permb.flags.nonsecure_readwrite_ok ? p : NULL;
-+ case CMSE_MPU_READ | CMSE_AU_NONSECURE:
-+ return permb.flags.nonsecure_read_ok ? p : NULL;
-+ case CMSE_AU_NONSECURE:
-+ return permb.flags.secure ? NULL : p;
-+#endif
-+ case CMSE_MPU_READ | CMSE_MPU_READWRITE:
-+ case CMSE_MPU_READWRITE:
-+ return permb.flags.readwrite_ok ? p : NULL;
-+ case CMSE_MPU_READ:
-+ return permb.flags.read_ok ? p : NULL;
-+ default:
-+ return NULL;
-+ }
-+}
-+
-+
-+#endif /* __ARM_FEATURE_CMSE & 1. */
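A hedged usage sketch for the routine above (the buffer, length, and reject handler are invented; the CMSE_AU_NONSECURE flag additionally requires __ARM_FEATURE_CMSE & 2):

    #include <arm_cmse.h>

    /* Validate a buffer handed over by the non-secure world before
       secure code dereferences it.  */
    void *ok = cmse_check_address_range (buf, len,
                                         CMSE_MPU_READWRITE | CMSE_AU_NONSECURE);
    if (ok == NULL)
      reject ();   /* wraps around, crosses a boundary, or lacks access */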
---- /dev/null
-+++ b/src/libgcc/config/arm/cmse_nonsecure_call.S
-@@ -0,0 +1,131 @@
-+/* CMSE wrapper function used to save, clear and restore callee saved registers
-+ for cmse_nonsecure_call's.
-+
-+ Copyright (C) 2016 Free Software Foundation, Inc.
-+ Contributed by ARM Ltd.
-+
-+ This file is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by the
-+ Free Software Foundation; either version 3, or (at your option) any
-+ later version.
-+
-+ This file is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ Under Section 7 of GPL version 3, you are granted additional
-+ permissions described in the GCC Runtime Library Exception, version
-+ 3.1, as published by the Free Software Foundation.
-+
-+ You should have received a copy of the GNU General Public License and
-+ a copy of the GCC Runtime Library Exception along with this program;
-+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
-+ <http://www.gnu.org/licenses/>. */
-+
-+.syntax unified
-+.thumb
-+.global __gnu_cmse_nonsecure_call
-+__gnu_cmse_nonsecure_call:
-+#if defined(__ARM_ARCH_8M_MAIN__)
-+push {r5-r11,lr}
-+mov r7, r4
-+mov r8, r4
-+mov r9, r4
-+mov r10, r4
-+mov r11, r4
-+mov ip, r4
-+
-+/* Save and clear callee-saved registers only if we are dealing with hard float
-+ ABI. The unused caller-saved registers have already been cleared by GCC
-+ generated code. */
-+#ifdef __ARM_PCS_VFP
-+vpush.f64 {d8-d15}
-+mov r5, #0
-+vmov d8, r5, r5
-+#if __ARM_FP & 0x04
-+vmov s18, s19, r5, r5
-+vmov s20, s21, r5, r5
-+vmov s22, s23, r5, r5
-+vmov s24, s25, r5, r5
-+vmov s26, s27, r5, r5
-+vmov s28, s29, r5, r5
-+vmov s30, s31, r5, r5
-+#elif __ARM_FP & 0x08
-+vmov.f64 d9, d8
-+vmov.f64 d10, d8
-+vmov.f64 d11, d8
-+vmov.f64 d12, d8
-+vmov.f64 d13, d8
-+vmov.f64 d14, d8
-+vmov.f64 d15, d8
-+#else
-+#error "Half precision implementation not supported."
-+#endif
-+/* Clear the cumulative exception-status bits (0-4,7) and the
-+ condition code bits (28-31) of the FPSCR. */
-+vmrs r5, fpscr
-+movw r6, #65376
-+movt r6, #4095
-+ands r5, r6
-+vmsr fpscr, r5
-+
-+/* We are not dealing with hard float ABI, so we can safely use the vlstm and
-+ vlldm instructions without needing to preserve the registers used for
-+ argument passing. */
-+#else
-+sub sp, sp, #0x88 /* Reserve stack space to save all floating point
-+ registers, including FPSCR. */
-+vlstm sp /* Lazy store and clearance of d0-d16 and FPSCR. */
-+#endif /* __ARM_PCS_VFP */
-+
-+/* Make sure to clear the 'GE' bits of the APSR register if 32-bit SIMD
-+ instructions are available. */
-+#if defined(__ARM_FEATURE_SIMD32)
-+msr APSR_nzcvqg, r4
-+#else
-+msr APSR_nzcvq, r4
-+#endif
-+
-+mov r5, r4
-+mov r6, r4
-+blxns r4
-+
-+#ifdef __ARM_PCS_VFP
-+vpop.f64 {d8-d15}
-+#else
-+vlldm sp /* Lazy restore of d0-d16 and FPSCR. */
-+add sp, sp, #0x88 /* Free space used to save floating point registers. */
-+#endif /* __ARM_PCS_VFP */
-+
-+pop {r5-r11, pc}
-+
-+#elif defined (__ARM_ARCH_8M_BASE__)
-+push {r5-r7, lr}
-+mov r5, r8
-+mov r6, r9
-+mov r7, r10
-+push {r5-r7}
-+mov r5, r11
-+push {r5}
-+mov r5, r4
-+mov r6, r4
-+mov r7, r4
-+mov r8, r4
-+mov r9, r4
-+mov r10, r4
-+mov r11, r4
-+mov ip, r4
-+msr APSR_nzcvq, r4
-+blxns r4
-+pop {r5}
-+mov r11, r5
-+pop {r5-r7}
-+mov r10, r7
-+mov r9, r6
-+mov r8, r5
-+pop {r5-r7, pc}
-+
-+#else
-+#error "This should only be used for armv8-m base- and mainline."
-+#endif
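This veneer is never called by hand; GCC routes calls through function pointers carrying the cmse_nonsecure_call attribute to it. A minimal sketch (type and function names invented):

    /* Calls through f save and clear callee-saved state in
       __gnu_cmse_nonsecure_call and transition with blxns.  */
    typedef void __attribute__ ((cmse_nonsecure_call)) (*ns_func) (void);

    void
    call_nonsecure (ns_func f)
    {
      f ();
    }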
---- a/src/libgcc/config/arm/ieee754-df.S
-+++ b/src/libgcc/config/arm/ieee754-df.S
-@@ -160,8 +160,8 @@ ARM_FUNC_ALIAS aeabi_dadd adddf3
- teq r4, r5
- beq LSYM(Lad_d)
-
--@ CFI note: we're lucky that the branches to Lad_* that appear after this function
--@ have a CFI state that's exactly the same as the one we're in at this
-+@ CFI note: we're lucky that the branches to Lad_* that appear after this
-+@ function have a CFI state that's exactly the same as the one we're in at this
- @ point. Otherwise the CFI would change to a different state after the branch,
- @ which would be disastrous for backtracing.
- LSYM(Lad_x):
-@@ -507,11 +507,15 @@ ARM_FUNC_ALIAS aeabi_f2d extendsfdf2
- eorne xh, xh, #0x38000000 @ fixup exponent otherwise.
- RETc(ne) @ and return it.
-
-- teq r2, #0 @ if actually 0
-- do_it ne, e
-- teqne r3, #0xff000000 @ or INF or NAN
-+ bics r2, r2, #0xff000000 @ isolate mantissa
-+ do_it eq @ if 0, that is ZERO or INF,
- RETc(eq) @ we are done already.
-
-+ teq r3, #0xff000000 @ check for NAN
-+ do_it eq, t
-+ orreq xh, xh, #0x00080000 @ change to quiet NAN
-+ RETc(eq) @ and return it.
-+
- @ value was denormalized. We can normalize it now.
- do_push {r4, r5, lr}
- .cfi_adjust_cfa_offset 12 @ CFA is now sp + previousOffset + 12
-@@ -1158,8 +1162,8 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2
- 1: str ip, [sp, #-4]!
- .cfi_adjust_cfa_offset 4 @ CFA is now sp + previousOffset + 4.
- @ We're not adding CFI for ip as it's pushed into the stack
-- @ only because @ it may be popped off later as a return value
-- @ (i.e. we're not preserving @ it anyways).
-+ @ only because it may be popped off later as a return value
-+ @ (i.e. we're not preserving it anyways).
-
- @ Trap any INF/NAN first.
- mov ip, xh, lsl #1
-@@ -1169,14 +1173,14 @@ ARM_FUNC_ALIAS eqdf2 cmpdf2
- COND(mvn,s,ne) ip, ip, asr #21
- beq 3f
- .cfi_remember_state
-- @ Save the current CFI state. This is done because the branch
-- @ is conditional, @ and if we don't take it we'll issue a
-- @ .cfi_adjust_cfa_offset and return. @ If we do take it,
-- @ however, the .cfi_adjust_cfa_offset from the non-branch @ code
-- @ will affect the branch code as well. To avoid this we'll
-- @ restore @ the current state before executing the branch code.
--
-- @ Test for equality. @ Note that 0.0 is equal to -0.0.
-+ @ Save the current CFI state. This is done because the branch
-+ @ is conditional, and if we don't take it we'll issue a
-+ @ .cfi_adjust_cfa_offset and return. If we do take it,
-+ @ however, the .cfi_adjust_cfa_offset from the non-branch code
-+ @ will affect the branch code as well. To avoid this we'll
-+ @ restore the current state before executing the branch code.
-+
-+ @ Test for equality. Note that 0.0 is equal to -0.0.
- 2: add sp, sp, #4
- .cfi_adjust_cfa_offset -4 @ CFA is now sp + previousOffset.
-
---- a/src/libgcc/config/arm/lib1funcs.S
-+++ b/src/libgcc/config/arm/lib1funcs.S
-@@ -108,7 +108,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- # define __ARM_ARCH__ 7
- #endif
-
--#if defined(__ARM_ARCH_8A__)
-+#if defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8M_BASE__) \
-+ || defined(__ARM_ARCH_8M_MAIN__)
- # define __ARM_ARCH__ 8
- #endif
-
-@@ -124,10 +125,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- && !defined(__thumb2__) \
- && (!defined(__THUMB_INTERWORK__) \
- || defined (__OPTIMIZE_SIZE__) \
-- || defined(__ARM_ARCH_6M__)))
-+ || !__ARM_ARCH_ISA_ARM))
- # define __prefer_thumb__
- #endif
-
-+#if !__ARM_ARCH_ISA_ARM && __ARM_ARCH_ISA_THUMB == 1
-+#define NOT_ISA_TARGET_32BIT 1
-+#endif
-+
- /* How to return from a function call depends on the architecture variant. */
-
- #if (__ARM_ARCH__ > 4) || defined(__ARM_ARCH_4T__)
-@@ -305,35 +310,14 @@ LSYM(Lend_fde):
-
- #ifdef __ARM_EABI__
- .macro THUMB_LDIV0 name signed
--#if defined(__ARM_ARCH_6M__)
-- .ifc \signed, unsigned
-- cmp r0, #0
-- beq 1f
-- mov r0, #0
-- mvn r0, r0 @ 0xffffffff
--1:
-- .else
-- cmp r0, #0
-- beq 2f
-- blt 3f
-+#ifdef NOT_ISA_TARGET_32BIT
-+
-+ push {r0, lr}
- mov r0, #0
-- mvn r0, r0
-- lsr r0, r0, #1 @ 0x7fffffff
-- b 2f
--3: mov r0, #0x80
-- lsl r0, r0, #24 @ 0x80000000
--2:
-- .endif
-- push {r0, r1, r2}
-- ldr r0, 4f
-- adr r1, 4f
-- add r0, r1
-- str r0, [sp, #8]
-+ bl SYM(__aeabi_idiv0)
- @ We know we are not on armv4t, so pop pc is safe.
-- pop {r0, r1, pc}
-- .align 2
--4:
-- .word __aeabi_idiv0 - 4b
-+ pop {r1, pc}
-+
- #elif defined(__thumb2__)
- .syntax unified
- .ifc \signed, unsigned
-@@ -478,7 +462,7 @@ _L__\name:
-
- #else /* !(__INTERWORKING_STUBS__ || __thumb2__) */
-
--#ifdef __ARM_ARCH_6M__
-+#ifdef NOT_ISA_TARGET_32BIT
- #define EQUIV .thumb_set
- #else
- .macro ARM_FUNC_START name sp_section=
-@@ -510,7 +494,7 @@ SYM (__\name):
- #endif
- .endm
-
--#ifndef __ARM_ARCH_6M__
-+#ifndef NOT_ISA_TARGET_32BIT
- .macro ARM_FUNC_ALIAS new old
- .globl SYM (__\new)
- EQUIV SYM (__\new), SYM (__\old)
-@@ -945,7 +929,170 @@ LSYM(Lover7):
- add dividend, work
- .endif
- LSYM(Lgot_result):
--.endm
-+.endm
-+
-+/* If performance is preferred, the following functions are provided. */
-+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
-+
-+/* Branch to div(n), and jump to label if curbit is lower than divisor. */
-+.macro BranchToDiv n, label
-+ lsr curbit, dividend, \n
-+ cmp curbit, divisor
-+ blo \label
-+.endm
-+
-+/* Body of div(n). Shift the divisor left by n bits and compare it with
-+ the dividend. Update the dividend with the subtraction result. */
-+.macro DoDiv n
-+ lsr curbit, dividend, \n
-+ cmp curbit, divisor
-+ bcc 1f
-+ lsl curbit, divisor, \n
-+ sub dividend, dividend, curbit
-+
-+1: adc result, result
-+.endm
-+
-+/* The body of division with positive divisor. Unless the divisor is very
-+ big, shift it up in multiples of four bits, since this is the amount of
-+ unwinding in the main division loop. Continue shifting until the divisor
-+ is larger than the dividend. */
-+.macro THUMB1_Div_Positive
-+ mov result, #0
-+ BranchToDiv #1, LSYM(Lthumb1_div1)
-+ BranchToDiv #4, LSYM(Lthumb1_div4)
-+ BranchToDiv #8, LSYM(Lthumb1_div8)
-+ BranchToDiv #12, LSYM(Lthumb1_div12)
-+ BranchToDiv #16, LSYM(Lthumb1_div16)
-+LSYM(Lthumb1_div_large_positive):
-+ mov result, #0xff
-+ lsl divisor, divisor, #8
-+ rev result, result
-+ lsr curbit, dividend, #16
-+ cmp curbit, divisor
-+ blo 1f
-+ asr result, #8
-+ lsl divisor, divisor, #8
-+ beq LSYM(Ldivbyzero_waypoint)
-+
-+1: lsr curbit, dividend, #12
-+ cmp curbit, divisor
-+ blo LSYM(Lthumb1_div12)
-+ b LSYM(Lthumb1_div16)
-+LSYM(Lthumb1_div_loop):
-+ lsr divisor, divisor, #8
-+LSYM(Lthumb1_div16):
-+ DoDiv #15
-+ DoDiv #14
-+ DoDiv #13
-+ DoDiv #12
-+LSYM(Lthumb1_div12):
-+ DoDiv #11
-+ DoDiv #10
-+ DoDiv #9
-+ DoDiv #8
-+ bcs LSYM(Lthumb1_div_loop)
-+LSYM(Lthumb1_div8):
-+ DoDiv #7
-+ DoDiv #6
-+ DoDiv #5
-+LSYM(Lthumb1_div5):
-+ DoDiv #4
-+LSYM(Lthumb1_div4):
-+ DoDiv #3
-+LSYM(Lthumb1_div3):
-+ DoDiv #2
-+LSYM(Lthumb1_div2):
-+ DoDiv #1
-+LSYM(Lthumb1_div1):
-+ sub divisor, dividend, divisor
-+ bcs 1f
-+ cpy divisor, dividend
-+
-+1: adc result, result
-+ cpy dividend, result
-+ RET
-+
-+LSYM(Ldivbyzero_waypoint):
-+ b LSYM(Ldiv0)
-+.endm
-+
-+/* The body of division with negative divisor. Similar to
-+ THUMB1_Div_Positive except that the shift steps are in multiples
-+ of six bits. */
-+.macro THUMB1_Div_Negative
-+ lsr result, divisor, #31
-+ beq 1f
-+ neg divisor, divisor
-+
-+1: asr curbit, dividend, #32
-+ bcc 2f
-+ neg dividend, dividend
-+
-+2: eor curbit, result
-+ mov result, #0
-+ cpy ip, curbit
-+ BranchToDiv #4, LSYM(Lthumb1_div_negative4)
-+ BranchToDiv #8, LSYM(Lthumb1_div_negative8)
-+LSYM(Lthumb1_div_large):
-+ mov result, #0xfc
-+ lsl divisor, divisor, #6
-+ rev result, result
-+ lsr curbit, dividend, #8
-+ cmp curbit, divisor
-+ blo LSYM(Lthumb1_div_negative8)
-+
-+ lsl divisor, divisor, #6
-+ asr result, result, #6
-+ cmp curbit, divisor
-+ blo LSYM(Lthumb1_div_negative8)
-+
-+ lsl divisor, divisor, #6
-+ asr result, result, #6
-+ cmp curbit, divisor
-+ blo LSYM(Lthumb1_div_negative8)
-+
-+ lsl divisor, divisor, #6
-+ beq LSYM(Ldivbyzero_negative)
-+ asr result, result, #6
-+ b LSYM(Lthumb1_div_negative8)
-+LSYM(Lthumb1_div_negative_loop):
-+ lsr divisor, divisor, #6
-+LSYM(Lthumb1_div_negative8):
-+ DoDiv #7
-+ DoDiv #6
-+ DoDiv #5
-+ DoDiv #4
-+LSYM(Lthumb1_div_negative4):
-+ DoDiv #3
-+ DoDiv #2
-+ bcs LSYM(Lthumb1_div_negative_loop)
-+ DoDiv #1
-+ sub divisor, dividend, divisor
-+ bcs 1f
-+ cpy divisor, dividend
-+
-+1: cpy curbit, ip
-+ adc result, result
-+ asr curbit, curbit, #1
-+ cpy dividend, result
-+ bcc 2f
-+ neg dividend, dividend
-+ cmp curbit, #0
-+
-+2: bpl 3f
-+ neg divisor, divisor
-+
-+3: RET
-+
-+LSYM(Ldivbyzero_negative):
-+ cpy curbit, ip
-+ asr curbit, curbit, #1
-+ bcc LSYM(Ldiv0)
-+ neg dividend, dividend
-+.endm
-+#endif /* ARM Thumb version. */
-+
- /* ------------------------------------------------------------------------ */
- /* Start of the Real Functions */
- /* ------------------------------------------------------------------------ */
-@@ -955,6 +1102,7 @@ LSYM(Lgot_result):
-
- FUNC_START udivsi3
- FUNC_ALIAS aeabi_uidiv udivsi3
-+#if defined(__OPTIMIZE_SIZE__)
-
- cmp divisor, #0
- beq LSYM(Ldiv0)
-@@ -972,6 +1120,14 @@ LSYM(udivsi3_skip_div0_test):
- pop { work }
- RET
-
-+/* Implementation of aeabi_uidiv for ARMv6-M. This version is only
-+ used in ARMv6-M when we need an efficient implementation. */
-+#else
-+LSYM(udivsi3_skip_div0_test):
-+ THUMB1_Div_Positive
-+
-+#endif /* __OPTIMIZE_SIZE__ */
-+
- #elif defined(__ARM_ARCH_EXT_IDIV__)
-
- ARM_FUNC_START udivsi3
-@@ -1023,12 +1179,21 @@ LSYM(udivsi3_skip_div0_test):
- FUNC_START aeabi_uidivmod
- cmp r1, #0
- beq LSYM(Ldiv0)
-+# if defined(__OPTIMIZE_SIZE__)
- push {r0, r1, lr}
- bl LSYM(udivsi3_skip_div0_test)
- POP {r1, r2, r3}
- mul r2, r0
- sub r1, r1, r2
- bx r3
-+# else
-+ /* Both the quotient and remainder are calculated simultaneously
-+ in THUMB1_Div_Positive. There is no need to calculate the
-+ remainder again here. */
-+ b LSYM(udivsi3_skip_div0_test)
-+ RET
-+# endif /* __OPTIMIZE_SIZE__ */
-+
- #elif defined(__ARM_ARCH_EXT_IDIV__)
- ARM_FUNC_START aeabi_uidivmod
- cmp r1, #0
-@@ -1054,7 +1219,7 @@ ARM_FUNC_START aeabi_uidivmod
- /* ------------------------------------------------------------------------ */
- #ifdef L_umodsi3
-
--#ifdef __ARM_ARCH_EXT_IDIV__
-+#if defined(__ARM_ARCH_EXT_IDIV__) && __ARM_ARCH_ISA_THUMB != 1
-
- ARM_FUNC_START umodsi3
-
-@@ -1084,7 +1249,7 @@ LSYM(Lover10):
- RET
-
- #else /* ARM version. */
--
-+
- FUNC_START umodsi3
-
- subs r2, r1, #1 @ compare divisor with 1
-@@ -1109,8 +1274,9 @@ LSYM(Lover10):
-
- #if defined(__prefer_thumb__)
-
-- FUNC_START divsi3
-+ FUNC_START divsi3
- FUNC_ALIAS aeabi_idiv divsi3
-+#if defined(__OPTIMIZE_SIZE__)
-
- cmp divisor, #0
- beq LSYM(Ldiv0)
-@@ -1133,7 +1299,7 @@ LSYM(Lover11):
- blo LSYM(Lgot_result)
-
- THUMB_DIV_MOD_BODY 0
--
-+
- mov r0, result
- mov work, ip
- cmp work, #0
-@@ -1143,6 +1309,22 @@ LSYM(Lover12):
- pop { work }
- RET
-
-+/* Implementation of aeabi_idiv for ARMv6-M. This version is only
-+ used in ARMv6-M when we need an efficient implementation. */
-+#else
-+LSYM(divsi3_skip_div0_test):
-+ cpy curbit, dividend
-+ orr curbit, divisor
-+ bmi LSYM(Lthumb1_div_negative)
-+
-+LSYM(Lthumb1_div_positive):
-+ THUMB1_Div_Positive
-+
-+LSYM(Lthumb1_div_negative):
-+ THUMB1_Div_Negative
-+
-+#endif /* __OPTIMIZE_SIZE__ */
-+
- #elif defined(__ARM_ARCH_EXT_IDIV__)
-
- ARM_FUNC_START divsi3
-@@ -1154,8 +1336,8 @@ LSYM(Lover12):
- RET
-
- #else /* ARM/Thumb-2 version. */
--
-- ARM_FUNC_START divsi3
-+
-+ ARM_FUNC_START divsi3
- ARM_FUNC_ALIAS aeabi_idiv divsi3
-
- cmp r1, #0
-@@ -1209,12 +1391,21 @@ LSYM(divsi3_skip_div0_test):
- FUNC_START aeabi_idivmod
- cmp r1, #0
- beq LSYM(Ldiv0)
-+# if defined(__OPTIMIZE_SIZE__)
- push {r0, r1, lr}
- bl LSYM(divsi3_skip_div0_test)
- POP {r1, r2, r3}
- mul r2, r0
- sub r1, r1, r2
- bx r3
-+# else
-+ /* Both the quotient and remainder are calculated simultaneously
-+ in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
-+ need to calculate the remainder again here. */
-+ b LSYM(divsi3_skip_div0_test)
-+ RET
-+# endif /* __OPTIMIZE_SIZE__ */
-+
- #elif defined(__ARM_ARCH_EXT_IDIV__)
- ARM_FUNC_START aeabi_idivmod
- cmp r1, #0
-@@ -1240,7 +1431,7 @@ ARM_FUNC_START aeabi_idivmod
- /* ------------------------------------------------------------------------ */
- #ifdef L_modsi3
-
--#if defined(__ARM_ARCH_EXT_IDIV__)
-+#if defined(__ARM_ARCH_EXT_IDIV__) && __ARM_ARCH_ISA_THUMB != 1
-
- ARM_FUNC_START modsi3
-
-@@ -1508,14 +1699,15 @@ LSYM(Lover12):
-
- #endif /* __symbian__ */
-
--#if ((__ARM_ARCH__ > 5) && !defined(__ARM_ARCH_6M__)) \
-- || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
-- || defined(__ARM_ARCH_5TEJ__)
-+#if (__ARM_ARCH_ISA_THUMB == 2 \
-+ || (__ARM_ARCH_ISA_ARM \
-+ && (__ARM_ARCH__ > 5 \
-+ || (__ARM_ARCH__ == 5 && __ARM_ARCH_ISA_THUMB))))
- #define HAVE_ARM_CLZ 1
- #endif
-
- #ifdef L_clzsi2
--#if defined(__ARM_ARCH_6M__)
-+#ifdef NOT_ISA_TARGET_32BIT
- FUNC_START clzsi2
- mov r1, #28
- mov r3, #1
-@@ -1576,7 +1768,7 @@ ARM_FUNC_START clzsi2
- #ifdef L_clzdi2
- #if !defined(HAVE_ARM_CLZ)
-
--# if defined(__ARM_ARCH_6M__)
-+# ifdef NOT_ISA_TARGET_32BIT
- FUNC_START clzdi2
- push {r4, lr}
- # else
-@@ -1601,7 +1793,7 @@ ARM_FUNC_START clzdi2
- bl __clzsi2
- # endif
- 2:
--# if defined(__ARM_ARCH_6M__)
-+# ifdef NOT_ISA_TARGET_32BIT
- pop {r4, pc}
- # else
- RETLDM r4
-@@ -1623,7 +1815,7 @@ ARM_FUNC_START clzdi2
- #endif /* L_clzdi2 */
-
- #ifdef L_ctzsi2
--#if defined(__ARM_ARCH_6M__)
-+#ifdef NOT_ISA_TARGET_32BIT
- FUNC_START ctzsi2
- neg r1, r0
- and r0, r0, r1
-@@ -1738,7 +1930,7 @@ ARM_FUNC_START ctzsi2
-
- /* Don't bother with the old interworking routines for Thumb-2. */
- /* ??? Maybe only omit these on "m" variants. */
--#if !defined(__thumb2__) && !defined(__ARM_ARCH_6M__)
-+#if !defined(__thumb2__) && __ARM_ARCH_ISA_ARM
-
- #if defined L_interwork_call_via_rX
-
-@@ -1983,11 +2175,12 @@ LSYM(Lchange_\register):
- .endm
-
- #ifndef __symbian__
--#ifndef __ARM_ARCH_6M__
-+/* The condition here must match the one in gcc/config/arm/elf.h. */
-+#ifndef NOT_ISA_TARGET_32BIT
- #include "ieee754-df.S"
- #include "ieee754-sf.S"
- #include "bpabi.S"
--#else /* __ARM_ARCH_6M__ */
-+#else /* NOT_ISA_TARGET_32BIT */
- #include "bpabi-v6m.S"
--#endif /* __ARM_ARCH_6M__ */
-+#endif /* NOT_ISA_TARGET_32BIT */
- #endif /* !__symbian__ */
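To ease review of the unrolled DoDiv chains above, this is the restoring division they implement, written out in C (a sketch only: the assembly unrolls four steps per loop iteration and picks the starting shift from the operand magnitudes instead of always starting at 31):

    static unsigned
    udiv_sketch (unsigned dividend, unsigned divisor)
    {
      unsigned result = 0;
      for (int n = 31; n >= 0; n--)
        {
          result <<= 1;
          if ((dividend >> n) >= divisor)   /* cmp curbit, divisor  */
            {
              dividend -= divisor << n;     /* sub dividend, curbit */
              result |= 1;                  /* adc result, result   */
            }
        }
      return result;
    }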
---- a/src/libgcc/config/arm/libunwind.S
-+++ b/src/libgcc/config/arm/libunwind.S
-@@ -58,7 +58,7 @@
- #endif
- #endif
-
--#ifdef __ARM_ARCH_6M__
-+#if !__ARM_ARCH_ISA_ARM && __ARM_ARCH_ISA_THUMB == 1
-
- /* r0 points to a 16-word block. Upload these values to the actual core
- state. */
-@@ -169,7 +169,7 @@ FUNC_START gnu_Unwind_Save_WMMXC
- UNPREFIX \name
- .endm
-
--#else /* !__ARM_ARCH_6M__ */
-+#else /* __ARM_ARCH_ISA_ARM || __ARM_ARCH_ISA_THUMB != 1 */
-
- /* r0 points to a 16-word block. Upload these values to the actual core
- state. */
-@@ -351,7 +351,7 @@ ARM_FUNC_START gnu_Unwind_Save_WMMXC
- UNPREFIX \name
- .endm
-
--#endif /* !__ARM_ARCH_6M__ */
-+#endif /* __ARM_ARCH_ISA_ARM || __ARM_ARCH_ISA_THUMB != 1 */
-
- UNWIND_WRAPPER _Unwind_RaiseException 1
- UNWIND_WRAPPER _Unwind_Resume 1
---- a/src/libgcc/config/arm/t-arm
-+++ b/src/libgcc/config/arm/t-arm
-@@ -1,3 +1,17 @@
- LIB1ASMSRC = arm/lib1funcs.S
- LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \
- _thumb1_case_uhi _thumb1_case_si
-+
-+HAVE_CMSE:=$(findstring __ARM_FEATURE_CMSE,$(shell $(gcc_compile_bare) -dM -E - </dev/null))
-+ifneq ($(shell $(gcc_compile_bare) -E -mcmse - </dev/null 2>/dev/null),)
-+CMSE_OPTS:=-mcmse
-+endif
-+
-+ifdef HAVE_CMSE
-+libgcc-objects += cmse.o cmse_nonsecure_call.o
-+
-+cmse.o: $(srcdir)/config/arm/cmse.c
-+ $(gcc_compile) -c $(CMSE_OPTS) $<
-+cmse_nonsecure_call.o: $(srcdir)/config/arm/cmse_nonsecure_call.S
-+ $(gcc_compile) -c $<
-+endif
---- a/src/libgcc/config/arm/t-softfp
-+++ b/src/libgcc/config/arm/t-softfp
-@@ -1,2 +1,2 @@
--softfp_wrap_start := '\#ifdef __ARM_ARCH_6M__'
-+softfp_wrap_start := '\#if !__ARM_ARCH_ISA_ARM && __ARM_ARCH_ISA_THUMB == 1'
- softfp_wrap_end := '\#endif'
---- a/src/libgcc/libgcc2.c
-+++ b/src/libgcc/libgcc2.c
-@@ -1852,7 +1852,8 @@ NAME (TYPE x, int m)
-
- #endif
-
--#if ((defined(L_mulsc3) || defined(L_divsc3)) && LIBGCC2_HAS_SF_MODE) \
-+#if((defined(L_mulhc3) || defined(L_divhc3)) && LIBGCC2_HAS_HF_MODE) \
-+ || ((defined(L_mulsc3) || defined(L_divsc3)) && LIBGCC2_HAS_SF_MODE) \
- || ((defined(L_muldc3) || defined(L_divdc3)) && LIBGCC2_HAS_DF_MODE) \
- || ((defined(L_mulxc3) || defined(L_divxc3)) && LIBGCC2_HAS_XF_MODE) \
- || ((defined(L_multc3) || defined(L_divtc3)) && LIBGCC2_HAS_TF_MODE)
-@@ -1861,7 +1862,13 @@ NAME (TYPE x, int m)
- #undef double
- #undef long
-
--#if defined(L_mulsc3) || defined(L_divsc3)
-+#if defined(L_mulhc3) || defined(L_divhc3)
-+# define MTYPE HFtype
-+# define CTYPE HCtype
-+# define MODE hc
-+# define CEXT __LIBGCC_HF_FUNC_EXT__
-+# define NOTRUNC (!__LIBGCC_HF_EXCESS_PRECISION__)
-+#elif defined(L_mulsc3) || defined(L_divsc3)
- # define MTYPE SFtype
- # define CTYPE SCtype
- # define MODE sc
-@@ -1922,7 +1929,7 @@ extern void *compile_type_assert[sizeof(INFINITY) == sizeof(MTYPE) ? 1 : -1];
- # define TRUNC(x) __asm__ ("" : "=m"(x) : "m"(x))
- #endif
-
--#if defined(L_mulsc3) || defined(L_muldc3) \
-+#if defined(L_mulhc3) || defined(L_mulsc3) || defined(L_muldc3) \
- || defined(L_mulxc3) || defined(L_multc3)
-
- CTYPE
-@@ -1992,7 +1999,7 @@ CONCAT3(__mul,MODE,3) (MTYPE a, MTYPE b, MTYPE c, MTYPE d)
- }
- #endif /* complex multiply */
-
--#if defined(L_divsc3) || defined(L_divdc3) \
-+#if defined(L_divhc3) || defined(L_divsc3) || defined(L_divdc3) \
- || defined(L_divxc3) || defined(L_divtc3)
-
- CTYPE
---- a/src/libgcc/libgcc2.h
-+++ b/src/libgcc/libgcc2.h
-@@ -34,6 +34,12 @@ extern void __clear_cache (char *, char *);
- extern void __eprintf (const char *, const char *, unsigned int, const char *)
- __attribute__ ((__noreturn__));
-
-+#ifdef __LIBGCC_HAS_HF_MODE__
-+#define LIBGCC2_HAS_HF_MODE 1
-+#else
-+#define LIBGCC2_HAS_HF_MODE 0
-+#endif
-+
- #ifdef __LIBGCC_HAS_SF_MODE__
- #define LIBGCC2_HAS_SF_MODE 1
- #else
-@@ -133,6 +139,10 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
- #endif
- #endif
-
-+#if LIBGCC2_HAS_HF_MODE
-+typedef float HFtype __attribute__ ((mode (HF)));
-+typedef _Complex float HCtype __attribute__ ((mode (HC)));
-+#endif
- #if LIBGCC2_HAS_SF_MODE
- typedef float SFtype __attribute__ ((mode (SF)));
- typedef _Complex float SCtype __attribute__ ((mode (SC)));
-@@ -424,6 +434,10 @@ extern SItype __negvsi2 (SItype);
- #endif /* COMPAT_SIMODE_TRAPPING_ARITHMETIC */
-
- #undef int
-+#if LIBGCC2_HAS_HF_MODE
-+extern HCtype __divhc3 (HFtype, HFtype, HFtype, HFtype);
-+extern HCtype __mulhc3 (HFtype, HFtype, HFtype, HFtype);
-+#endif
- #if LIBGCC2_HAS_SF_MODE
- extern DWtype __fixsfdi (SFtype);
- extern SFtype __floatdisf (DWtype);
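The libgcc2 additions above provide half-precision complex multiply and divide. A sketch of code that would lower to the new __mulhc3 entry point, assuming a target where _Float16 and its complex variant are available:

    _Complex _Float16
    cmul (_Complex _Float16 a, _Complex _Float16 b)
    {
      return a * b;   /* expands to __mulhc3 (ar, ai, br, bi) */
    }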
---- a/src/libstdc++-v3/acinclude.m4
-+++ b/src/libstdc++-v3/acinclude.m4
-@@ -632,10 +632,10 @@ dnl baseline_dir
- dnl baseline_subdir_switch
- dnl
- AC_DEFUN([GLIBCXX_CONFIGURE_TESTSUITE], [
-- if $GLIBCXX_IS_NATIVE ; then
-- # Do checks for resource limit functions.
-- GLIBCXX_CHECK_SETRLIMIT
-+ # Do checks for resource limit functions.
-+ GLIBCXX_CHECK_SETRLIMIT
-
-+ if $GLIBCXX_IS_NATIVE ; then
- # Look for setenv, so that extended locale tests can be performed.
- GLIBCXX_CHECK_STDLIB_DECL_AND_LINKAGE_3(setenv)
- fi
---- a/src/libstdc++-v3/configure
-+++ b/src/libstdc++-v3/configure
-@@ -79519,8 +79519,7 @@ $as_echo "$ac_cv_x86_rdrand" >&6; }
-
- # This depends on GLIBCXX_ENABLE_SYMVERS and GLIBCXX_IS_NATIVE.
-
-- if $GLIBCXX_IS_NATIVE ; then
-- # Do checks for resource limit functions.
-+ # Do checks for resource limit functions.
-
- setrlimit_have_headers=yes
- for ac_header in unistd.h sys/time.h sys/resource.h
-@@ -79749,6 +79748,7 @@ $as_echo "#define _GLIBCXX_RES_LIMITS 1" >>confdefs.h
- $as_echo "$ac_res_limits" >&6; }
-
-
-+ if $GLIBCXX_IS_NATIVE ; then
- # Look for setenv, so that extended locale tests can be performed.
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for setenv declaration" >&5
---- a/src/libstdc++-v3/testsuite/29_atomics/atomic/65913.cc
-+++ b/src/libstdc++-v3/testsuite/29_atomics/atomic/65913.cc
-@@ -15,7 +15,8 @@
- // with this library; see the file COPYING3. If not see
- // <http://www.gnu.org/licenses/>.
-
--// { dg-do run { target x86_64-*-linux* powerpc*-*-linux* } }
-+// { dg-do run }
-+// { dg-require-atomic-builtins "" }
- // { dg-options "-std=gnu++11 -O0" }
-
- #include <atomic>